panproto_parse/emit_pretty/
grammar.rs

1#![allow(
2    clippy::module_name_repetitions,
3    clippy::too_many_lines,
4    clippy::too_many_arguments,
5    clippy::map_unwrap_or,
6    clippy::option_if_let_else,
7    clippy::elidable_lifetime_names,
8    clippy::items_after_statements,
9    clippy::needless_pass_by_value,
10    clippy::single_match_else,
11    clippy::manual_let_else,
12    clippy::match_same_arms,
13    clippy::missing_const_for_fn,
14    clippy::single_char_pattern,
15    clippy::naive_bytecount,
16    clippy::expect_used,
17    clippy::redundant_pub_crate,
18    clippy::used_underscore_binding,
19    clippy::redundant_field_names,
20    clippy::struct_field_names,
21    clippy::redundant_else,
22    clippy::similar_names
23)]
24
25//! `emit_pretty::grammar` (Phase A decomposition).
26
27use super::{
28    BTreeMap, Deserialize, ParseError, collect_all_symbol_refs, external_symbol_name,
29    extract_line_comment_prefix, has_repeat_recursive, is_immediate_token, is_newline_like_pattern,
30    is_prefix_sigil, is_quote_delimiter, is_rest_of_line_pattern, is_whitespace_only_pattern,
31    is_word_like, kind_satisfies_symbol, leading_optional_sign, literal_strings,
32    matching_close_bracket, pattern_absorbs_leading_space, referenced_symbols, terminal_pattern_of,
33    unwrap_to_seq, unwrap_to_string,
34};
35
36// ═══════════════════════════════════════════════════════════════════
37// Grammar JSON model
38// ═══════════════════════════════════════════════════════════════════
39
40/// A single tree-sitter production rule.
41///
42/// Mirrors the shape emitted by `tree-sitter generate`: every node has
43/// a `type` discriminator that selects a structural variant. The
44/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
45/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
46/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
47/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
48#[derive(Debug, Clone, Deserialize)]
49#[serde(tag = "type")]
50#[non_exhaustive]
51pub enum Production {
52    /// Concatenation of productions.
53    #[serde(rename = "SEQ")]
54    Seq {
55        /// Ordered members; each is emitted in turn.
56        members: Vec<Self>,
57    },
58    /// Alternation between productions.
59    #[serde(rename = "CHOICE")]
60    Choice {
61        /// Alternatives; the walker picks one based on the schema's
62        /// children and constraints.
63        members: Vec<Self>,
64    },
65    /// Zero-or-more repetition.
66    #[serde(rename = "REPEAT")]
67    Repeat {
68        /// The repeated body.
69        content: Box<Self>,
70    },
71    /// One-or-more repetition.
72    #[serde(rename = "REPEAT1")]
73    Repeat1 {
74        /// The repeated body.
75        content: Box<Self>,
76    },
77    /// Optional inclusion (zero or one).
78    ///
79    /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
80    /// BLANK }`, but recent generator versions also emit explicit
81    /// `OPTIONAL` nodes; both shapes are accepted.
82    #[serde(rename = "OPTIONAL")]
83    Optional {
84        /// The optional body.
85        content: Box<Self>,
86    },
87    /// Reference to another rule by name.
88    #[serde(rename = "SYMBOL")]
89    Symbol {
90        /// Name of the referenced rule (matches a vertex kind on the
91        /// schema side).
92        name: String,
93    },
94    /// Literal token bytes.
95    #[serde(rename = "STRING")]
96    String {
97        /// The literal token. Emitted verbatim.
98        value: String,
99    },
100    /// Regex-matched terminal.
101    ///
102    /// At parse time this matches arbitrary bytes; at emit time the
103    /// walker substitutes a `literal-value` constraint when present
104    /// and falls back to a placeholder otherwise.
105    #[serde(rename = "PATTERN")]
106    Pattern {
107        /// The original regex.
108        value: String,
109    },
110    /// The empty production. Emits nothing.
111    #[serde(rename = "BLANK")]
112    Blank,
113    /// Named field over a content production.
114    ///
115    /// The field `name` matches an edge kind on the schema side; the
116    /// walker resolves the corresponding child vertex and recurses
117    /// into `content` with that child as context.
118    #[serde(rename = "FIELD")]
119    Field {
120        /// Field name (matches edge kind).
121        name: String,
122        /// The contents of the field.
123        content: Box<Self>,
124    },
125    /// An aliased production.
126    ///
127    /// `value` records the parser-visible kind; the walker emits
128    /// `content` and ignores the alias rename.
129    #[serde(rename = "ALIAS")]
130    Alias {
131        /// The aliased content.
132        content: Box<Self>,
133        /// Whether the alias is a named node.
134        #[serde(default)]
135        named: bool,
136        /// The alias's surface name.
137        #[serde(default)]
138        value: String,
139    },
140    /// A token wrapper.
141    ///
142    /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
143    /// lexical token; the walker emits the inner content unchanged.
144    #[serde(rename = "TOKEN")]
145    Token {
146        /// The wrapped content.
147        content: Box<Self>,
148    },
149    /// An immediate-token wrapper (no preceding whitespace).
150    ///
151    /// Treated like [`Production::Token`] for emit purposes.
152    #[serde(rename = "IMMEDIATE_TOKEN")]
153    ImmediateToken {
154        /// The wrapped content.
155        content: Box<Self>,
156    },
157    /// Precedence wrapper.
158    #[serde(rename = "PREC")]
159    Prec {
160        /// Precedence value (numeric or string). Ignored at emit time.
161        #[allow(dead_code)]
162        value: serde_json::Value,
163        /// The wrapped content.
164        content: Box<Self>,
165    },
166    /// Left-associative precedence wrapper.
167    #[serde(rename = "PREC_LEFT")]
168    PrecLeft {
169        /// Precedence value. Ignored at emit time.
170        #[allow(dead_code)]
171        value: serde_json::Value,
172        /// The wrapped content.
173        content: Box<Self>,
174    },
175    /// Right-associative precedence wrapper.
176    #[serde(rename = "PREC_RIGHT")]
177    PrecRight {
178        /// Precedence value. Ignored at emit time.
179        #[allow(dead_code)]
180        value: serde_json::Value,
181        /// The wrapped content.
182        content: Box<Self>,
183    },
184    /// Dynamic precedence wrapper.
185    #[serde(rename = "PREC_DYNAMIC")]
186    PrecDynamic {
187        /// Precedence value. Ignored at emit time.
188        #[allow(dead_code)]
189        value: serde_json::Value,
190        /// The wrapped content.
191        content: Box<Self>,
192    },
193    /// Reserved-word wrapper (tree-sitter ≥ 0.25).
194    ///
195    /// Tree-sitter's `RESERVED` rule marks an inner production as a
196    /// reserved-word context: the parser excludes the listed identifiers
197    /// from being treated as the inner symbol. The `context_name`
198    /// metadata names the reserved-word set; the emitter does not need
199    /// it (we are walking schema → bytes, not enforcing reserved-word
200    /// constraints), so we emit the inner content unchanged, the same
201    /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
202    #[serde(rename = "RESERVED")]
203    Reserved {
204        /// The wrapped content.
205        content: Box<Self>,
206        /// Name of the reserved-word context. Ignored at emit time.
207        #[allow(dead_code)]
208        #[serde(default)]
209        context_name: String,
210    },
211}
212
213/// Structural role of a STRING token within a grammar rule.
214///
215/// Derived at Grammar construction time from the token's position in
216/// the production rule body. The role determines spacing behavior in
217/// the layout pass via a role-pair lookup table.
218#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
219pub enum TokenRole {
220    /// First STRING in a matched-pair SEQ (e.g. `(`, `[`, `{`, `<`,
221    /// `begin`, `${`, `⟨`). No space after.
222    BracketOpen,
223    /// Last STRING in a matched-pair SEQ (e.g. `)`, `]`, `}`, `>`,
224    /// `end`, `⟩`). No space before.
225    BracketClose,
226    /// First STRING in a REPEAT body's inner SEQ (e.g. `,` in
227    /// `REPEAT(SEQ [",", item])`). No space before, space after.
228    Separator,
229    /// Alphanumeric STRING that is a language keyword (e.g. `if`,
230    /// `while`, `and`, `model`). Space before and after.
231    Keyword,
232    /// Non-alphanumeric STRING between content members inside a CHOICE
233    /// alternative (e.g. `+`, `=`, `~`, `<-` in binary expression
234    /// alternatives). Space before and after.
235    Operator,
236    /// Non-alphanumeric STRING between content members in a standalone
237    /// SEQ (not inside a CHOICE). Examples: `.` in `attribute`,
238    /// `::` in `scoped_identifier`, `->` in `pointer_member`. These
239    /// are structural connectors, not algebraic operators. No space.
240    Connector,
241    /// Text from a leaf vertex's `literal-value` constraint.
242    Terminal,
243    /// A token the grammar wraps in `IMMEDIATE_TOKEN`: the lexer emits it
244    /// glued to its neighbour with no intervening whitespace (the `.` in
245    /// a float literal `0.5`, an immediate string delimiter). Tight on
246    /// both sides, unconditionally. Mirrors
247    /// [`panproto_gat::LayoutRole::Immediate`].
248    Immediate,
249}
250
251/// A grammar's production-rule table, deserialized from `grammar.json`.
252///
253/// Only the fields the emitter consumes are decoded; precedences,
254/// conflicts, externals, and other parser-only metadata are ignored.
255#[derive(Debug, Clone, Deserialize)]
256#[non_exhaustive]
257pub struct Grammar {
258    /// Grammar name (e.g. `"rust"`, `"typescript"`).
259    #[allow(dead_code)]
260    pub name: String,
261    /// The grammar's start symbol: the first rule as written in
262    /// grammar.json (tree-sitter's entry point). Recovered from the raw
263    /// bytes because [`rules`](Self::rules) is a `BTreeMap` that loses the
264    /// original insertion order.
265    #[serde(skip)]
266    pub start_symbol: String,
267    /// Cached least-fixpoint of the per-hidden-rule minimum mandated child
268    /// count (see `rule_min_required_children`). The table depends only on the
269    /// grammar, so it is resolved once on first use and reused: recomputing the
270    /// whole-grammar fixpoint on every emit-dispatch decision was an O(rules)
271    /// tax per decision, and on yaml (202 rules in one mutually-recursive SCC)
272    /// it dominated emit time. `OnceLock` gives interior-mutability caching
273    /// behind `&Grammar`; `#[serde(skip)]` leaves it empty on deserialize so it
274    /// fills lazily regardless of construction path.
275    #[serde(skip)]
276    pub(crate) min_children: std::sync::OnceLock<std::collections::HashMap<String, usize>>,
277    /// Map from rule name (a vertex kind on the schema side) to
278    /// production. Entries are kept in lexical order so iteration
279    /// is deterministic.
280    pub rules: BTreeMap<String, Production>,
281    /// Supertypes declared in the grammar's `supertypes` field. A
282    /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
283    /// references; tree-sitter parsers report a node's kind as one
284    /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
285    /// than the supertype name (`parameter`), so the emitter needs to
286    /// know that a child kind in a subtype set should match the
287    /// supertype name when a SYMBOL references it.
288    #[serde(default, deserialize_with = "deserialize_supertypes")]
289    pub supertypes: std::collections::HashSet<String>,
290    /// Tree-sitter `extras` rules: the named symbols (typically comments)
291    /// that tree-sitter skips at parse time but records as children of the
292    /// surrounding vertex. They appear nowhere in the production grammar,
293    /// so the rule walker cannot reconcile them against the cursor — the
294    /// emit pass therefore drains them as a side channel: at vertex entry
295    /// and between REPEAT iterations any leading extras-kind edges are
296    /// consumed and emitted directly. The set is populated at
297    /// `Grammar::from_bytes` by collecting every `SYMBOL { name }` and
298    /// named `ALIAS { value, named: true }` under the top-level `extras`
299    /// array. Pattern-only extras (e.g. `\s` whitespace) are not vertex
300    /// kinds and are excluded.
301    #[serde(default, deserialize_with = "deserialize_extras")]
302    pub extras: std::collections::HashSet<String>,
303    /// Tree-sitter `inline` rules: named rules the generator splices into
304    /// every referencing production rather than emitting as their own
305    /// node. An inlined rule's children (its FIELDs and bare SYMBOL
306    /// members) are promoted to be direct children of the referencing
307    /// vertex, so on the schema side there is no child vertex of the
308    /// inlined rule's kind. When the emit walk hits a `SYMBOL` member
309    /// naming an inlined rule it must therefore expand that rule's body
310    /// inline against the current cursor (the same treatment a hidden
311    /// `_`-prefixed rule gets), or the inlined members' edges are dropped
312    /// (brightscript `sub_impl`/`function_impl` drop `parameters`/`body`/
313    /// `end_statement`). Populated from grammar.json's top-level `inline`
314    /// array.
315    #[serde(
316        rename = "inline",
317        default,
318        deserialize_with = "deserialize_supertypes"
319    )]
320    pub inline_rules: std::collections::HashSet<String>,
321    /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
322    /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
323    /// reference on the schema side.
324    ///
325    /// Built once at [`Grammar::from_bytes`] time by walking each
326    /// hidden rule (`_`-prefixed), declared supertype, and named
327    /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
328    /// recording the closure. This replaces the prior heuristic
329    /// `kind_satisfies_symbol` that walked the rule body on every
330    /// query: lookups are now O(1) and the relation is exactly the
331    /// transitive closure of "is reachable via hidden / supertype /
332    /// alias dispatch", with no over-expansion through non-hidden
333    /// non-supertype rule references.
334    #[serde(skip)]
335    pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
336    /// Precomputed Yield sets: `yield_sets[rule_name]` is the set of
337    /// concrete vertex kinds that can appear as the **first named
338    /// child** when that rule's production is taken.
339    ///
340    /// Defined inductively:
341    /// - `Yield(SYMBOL S)` where S is hidden/supertype = `Yield(rules[S])`
342    /// - `Yield(SYMBOL S)` where S is concrete = `{S}`
343    /// - `Yield(SEQ [M1, ...])` = `Yield(M1)` (only first member)
344    /// - `Yield(CHOICE [M1, ..., Mn])` = `⋃ Yield(Mi)`
345    /// - `Yield(OPTIONAL { c })` = `Yield(c) ∪ {ε}`
346    /// - `Yield(BLANK)` = `{ε}`
347    /// - Wrappers (PREC*, TOKEN, FIELD, REPEAT, etc.) = `Yield(content)`
348    /// - `Yield(STRING)` = `Yield(PATTERN)` = `∅`
349    /// - `Yield(ALIAS { value: V, named: true })` = `{V}`
350    ///
351    /// Epsilon is represented as the empty string `""`.
352    #[serde(skip)]
353    pub yield_sets: std::collections::HashMap<String, std::collections::HashSet<String>>,
354    /// Child kinds allowed per parent kind, derived from node-types.json.
355    /// Maps parent kind to the set of ALL named child kinds that tree-sitter's
356    /// parser can produce for that parent (from both `children.types` and
357    /// `fields.*.types`). Used by `augment_subtypes_from_node_types` to
358    /// close the grammar/parser divergence gap.
359    #[serde(skip)]
360    pub node_type_children: std::collections::HashMap<String, std::collections::HashSet<String>>,
361    /// Per-field child kinds from node-types.json: maps parent kind →
362    /// field name → set of child kinds. Used by the augmentation to
363    /// restrict subtype edges to structurally matching positions.
364    #[serde(skip)]
365    pub node_type_field_children: std::collections::HashMap<
366        String,
367        std::collections::HashMap<String, std::collections::HashSet<String>>,
368    >,
369    /// Non-field child kinds from node-types.json: maps parent kind →
370    /// set of child kinds that appear in `children.types` (not in any field).
371    #[serde(skip)]
372    pub node_type_nonfield_children:
373        std::collections::HashMap<String, std::collections::HashSet<String>>,
374    /// Anonymous ALIAS values for external scanner tokens. Maps external
375    /// symbol name (e.g. `_ternary_qmark`) to the ALIAS value string
376    /// (e.g. `"?"`). Built by scanning grammar.json rule bodies for
377    /// `ALIAS { content: SYMBOL S, named: false, value: V }` where S
378    /// has no grammar rule.
379    #[serde(skip)]
380    pub external_alias_map: std::collections::HashMap<String, String>,
381    /// Per-rule token role classification. Maps rule name to a map of
382    /// STRING value to its structural role in that rule. Derived at
383    /// construction time by analyzing each rule's SEQ structure to
384    /// identify bracket pairs, separators, keywords, and operators.
385    #[serde(skip)]
386    pub token_roles:
387        std::collections::HashMap<String, std::collections::HashMap<String, TokenRole>>,
388    /// Set of `(rule_name, open_bracket_value)` pairs where the bracket
389    /// triggers indentation (the content between open and close contains
390    /// `REPEAT`/`REPEAT1`). Block-level constructs like `statement_block`
391    /// use indenting brackets; inline constructs like interpolation do not.
392    #[serde(skip)]
393    pub indent_triggers: std::collections::HashSet<(String, String)>,
394    /// Line-comment prefixes extracted from the grammar's extras.
395    /// Each prefix is a STRING value from a `TOKEN(SEQ [STRING prefix,
396    /// PATTERN ...])` pattern in the extras array, verified to be an
397    /// extras rule. Used by the layout pass to insert a newline after
398    /// comment Lit tokens.
399    #[serde(skip)]
400    pub line_comment_prefixes: Vec<String>,
401    /// Bare literal markers that, when emitted as the final token of the
402    /// output, must NOT be followed by the customary end-of-output newline.
403    ///
404    /// Derived from productions of the shape `SEQ[CHOICE[.. bare lit ..],
405    /// <newline-leading>]` — tree-sitter's "hard line break" idiom
406    /// (`markdown_inline`'s `hard_line_break = SEQ[CHOICE["\\" |
407    /// _whitespace_ge_2], _soft_line_break]`). A trailing backslash (or
408    /// trailing whitespace, see [`trailing_break_on_whitespace`]) is plain
409    /// content on its own; only a following newline turns it into a
410    /// line-break node. The end-of-output newline the layout fold appends
411    /// would therefore manufacture a phantom break node on re-parse, so it
412    /// is suppressed when the output ends with one of these markers.
413    ///
414    /// Restricted to SINGLE-character non-alphanumeric literals so the rule
415    /// fires only on genuine standalone break markers (`\`), never on
416    /// keyword/identifier-led line constructs (`posting`, `declaration`,
417    /// `go_directive`) whose leading literal is substantive content.
418    ///
419    /// [`trailing_break_on_whitespace`]: Self::trailing_break_on_whitespace
420    #[serde(skip)]
421    pub trailing_break_markers: Vec<String>,
422    /// Whether the grammar has a hard-line-break production whose leading
423    /// alternative is a whitespace-only PATTERN (`markdown_inline`'s
424    /// `_whitespace_ge_2`). When set, a final emitted token ending in
425    /// trailing spaces/tabs also suppresses the end-of-output newline.
426    #[serde(skip)]
427    pub trailing_break_on_whitespace: bool,
428    /// Whether the grammar's top-level document repeat directly admits a
429    /// free-text content node whose pattern matches a bare newline
430    /// (template / markup grammars: `liquid`'s `template_content =
431    /// REPEAT1([^{]+ | ...)`, `twig`'s `content`, `eex`'s `text`). For such
432    /// grammars a lone trailing newline appended at end of output is
433    /// captured as an extra content node on re-parse, inflating the
434    /// kind-multiset, so the end-of-output newline is suppressed.
435    ///
436    /// Derived narrowly: the content rule must be a DIRECT child of the
437    /// start symbol's top-level REPEAT (through hidden symbols / CHOICE),
438    /// so the rule fires only on genuine document text, never on the
439    /// newline-admitting negated classes inside comments or string
440    /// fragments (which are nested under delimiters, not document nodes).
441    #[serde(skip)]
442    pub top_level_text_admits_newline: bool,
443    /// External tokens that produce indent-open layout actions.
444    /// Identified by tree-sitter naming convention: names ending with
445    /// `_indent` or equal to `_indent`.
446    #[serde(skip)]
447    pub external_indent_opens: std::collections::HashSet<String>,
448    /// External tokens that produce indent-close layout actions.
449    #[serde(skip)]
450    pub external_indent_closes: std::collections::HashSet<String>,
451    /// External tokens that produce line breaks.
452    #[serde(skip)]
453    pub external_newlines: std::collections::HashSet<String>,
454    /// External tokens equivalent to semicolons.
455    #[serde(skip)]
456    pub external_semicolons: std::collections::HashSet<String>,
457    /// External scanner tokens that open a delimiter pair around content
458    /// (e.g. `string_start` in `SEQ[string_start, REPEAT(content),
459    /// string_end]`). Derived structurally; emitted tight on the inside
460    /// (`'hello'`, not `' hello '`).
461    #[serde(skip)]
462    pub external_bracket_opens: std::collections::HashSet<String>,
463    /// External scanner tokens that close a delimiter pair around content
464    /// (e.g. `string_end`). Emitted tight on the inside.
465    #[serde(skip)]
466    pub external_bracket_closes: std::collections::HashSet<String>,
467    /// Visible (non-`_`-prefixed) external scanner tokens that are the
468    /// captured *content* between a pair of string/heredoc delimiters in a
469    /// `SEQ[open_ext, REPEAT(content..), close_ext]` rule (ruby
470    /// `string_content` / `heredoc_content`, regex content, command-string
471    /// content). Such a token's text IS the literal source bytes between the
472    /// delimiters: the layout pass must NOT insert a sibling-separation
473    /// space around it (`"bar"`, not `" bar "`), or a space folds into the
474    /// captured text on re-parse and accretes one space per emit. Derived
475    /// structurally from the same delimiter shape as
476    /// [`external_bracket_opens`](Self::external_bracket_opens).
477    #[serde(skip)]
478    pub external_content_kinds: std::collections::HashSet<String>,
479    /// Named content kinds that sit *between a matched pair of quote
480    /// delimiters* spelled as literal `STRING` tokens, in a rule shaped
481    /// `SEQ[STRING q, REPEAT(CHOICE[content..]), STRING q]` (the same
482    /// quote opens and closes). The CSS `string_value` and the C# / Java
483    /// `string_literal` are the canonical cases: the body is a `REPEAT`
484    /// over `CHOICE[string_content (an ALIAS over a PATTERN),
485    /// escape_sequence]`. Each such content / escape leaf carries the
486    /// verbatim source bytes and must emit *tight* on both sides
487    /// (`"ab\t"`, not `"ab \t "`), exactly like
488    /// [`external_content_kinds`](Self::external_content_kinds) but for the
489    /// STRING-delimited (rather than external-delimited) string shape that
490    /// `classify_external_bracket_delimiters` skips (it only matches
491    /// *external* delimiters). Derived purely from grammar structure (the
492    /// matched-literal-quote envelope), so it stays in the generic emitter.
493    #[serde(skip)]
494    pub string_content_kinds: std::collections::HashSet<String>,
495    /// Rule names that are indented blocks whose opening `_indent` lives
496    /// in a (hidden) parent rule rather than the rule itself: their body
497    /// references an external indent-*close* token (`_dedent`) but no
498    /// indent-*open* token. The parser reaches such a block vertex
499    /// directly (the hidden `_suite` wrapper carrying the `_indent` is
500    /// not a vertex), so the emitter must synthesize the opening indent
501    /// (`def f():` then an indented body) when it walks the rule.
502    #[serde(skip)]
503    pub synthetic_indent_rules: std::collections::HashSet<String>,
504    /// Named alias map: maps alias value to source symbol name.
505    /// When a vertex kind has no direct grammar rule, this map resolves
506    /// `ALIAS { content: SYMBOL source, named: true, value: alias }` so
507    /// the emitter can walk the source rule with proper token roles.
508    #[serde(skip)]
509    pub named_alias_map: std::collections::HashMap<String, String>,
510    /// Every source rule that aliases to a given kind, in grammar order.
511    /// A kind can be the `value` of several distinct `ALIAS` sites (cpp
512    /// `function_definition` is the alias value of `inline_method_definition`,
513    /// `constructor_or_destructor_definition`, `operator_cast_definition`, …,
514    /// AND has its own `function_definition` rule). When the vertex's own
515    /// rule cannot consume one of its child edges (a parser.c/grammar.json
516    /// desync where the collapsed-kind rule omits constructor-only members
517    /// like `field_initializer_list`), the emitter falls back to the alias
518    /// source whose production *does* admit the child set.
519    #[serde(skip)]
520    pub named_alias_sources: std::collections::HashMap<String, Vec<String>>,
521    /// Named terminal kinds whose underlying `PATTERN` can match a leading
522    /// space (e.g. INI's `setting_value = PATTERN ".+"`). A layout space
523    /// emitted *before* such a terminal would fold into its captured text
524    /// on re-parse and accrete one space per emit, so the emitter hugs them
525    /// to their predecessor. See `pattern_absorbs_leading_space`.
526    #[serde(skip)]
527    pub leading_space_terminals: std::collections::HashSet<String>,
528    /// Named terminal kinds whose underlying `PATTERN` runs to the end of
529    /// the source line (an unbounded trailing `.*` / `.+`, e.g. JS's
530    /// `hash_bang_line = #!.*`). Like a line comment, such a token absorbs
531    /// any text that follows it on the same line, so the layout pass emits a
532    /// newline after it: otherwise the next sibling re-parses as part of the
533    /// token. See `is_rest_of_line_pattern`.
534    #[serde(skip)]
535    pub line_rest_kinds: std::collections::HashSet<String>,
536    /// Named alias values whose ALIAS content reduces to an `IMMEDIATE_TOKEN`
537    /// (e.g. C's `char_literal` body `ALIAS{IMMEDIATE_TOKEN PATTERN "[^\n']",
538    /// value: "character"}`). The lexer admits such a token only with no
539    /// preceding whitespace, so the emitter hugs it to its predecessor: the
540    /// alias-value carries no grammar rule, so the rule-head `IMMEDIATE_TOKEN`
541    /// no-space check in `emit_vertex` never fires for it. Emitting these
542    /// leaves tight keeps `'hey'` from re-spacing to `' h e y'` (whose spaces
543    /// re-parse as extra `character` nodes).
544    #[serde(skip)]
545    pub immediate_token_alias_kinds: std::collections::HashSet<String>,
546    /// Text to emit for an external *closing* delimiter whose matching
547    /// *opener* is a literal `STRING` (a rule shaped `SEQ[STRING q, body..,
548    /// EXTERNAL close]`, the asymmetric twin of the all-external and
549    /// all-STRING delimiter shapes). TOML's `_multiline_basic_string =
550    /// SEQ[STRING """, REPEAT(..), _multiline_basic_string_end]` is the
551    /// canonical case: the open `"""` is a grammar literal, but the close is
552    /// a scanner external with no rule and no resolvable text, so the
553    /// emitter would drop it and leave the string unterminated. A multiline
554    /// string closes with the same delimiter it opens with, so the external
555    /// close emits the opener's literal. Derived purely from grammar
556    /// structure (the STRING-open / external-close envelope); stays in the
557    /// generic emitter.
558    #[serde(skip)]
559    pub external_close_text: std::collections::HashMap<String, String>,
560}
561
562pub(crate) fn deserialize_supertypes<'de, D>(
563    deserializer: D,
564) -> Result<std::collections::HashSet<String>, D::Error>
565where
566    D: serde::Deserializer<'de>,
567{
568    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
569    let mut out = std::collections::HashSet::new();
570    for entry in entries {
571        match entry {
572            serde_json::Value::String(s) => {
573                out.insert(s);
574            }
575            serde_json::Value::Object(map) => {
576                if let Some(serde_json::Value::String(name)) = map.get("name") {
577                    out.insert(name.clone());
578                }
579            }
580            _ => {}
581        }
582    }
583    Ok(out)
584}
585
586pub(crate) fn deserialize_extras<'de, D>(
587    deserializer: D,
588) -> Result<std::collections::HashSet<String>, D::Error>
589where
590    D: serde::Deserializer<'de>,
591{
592    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
593    let mut out = std::collections::HashSet::new();
594    for entry in entries {
595        if let serde_json::Value::Object(map) = entry {
596            let ty = map.get("type").and_then(serde_json::Value::as_str);
597            match ty {
598                // SYMBOL { name: K } — the extras rule is a named symbol
599                // (typically `line_comment` / `block_comment`). The kind
600                // K appears as a real child vertex on the schema side.
601                Some("SYMBOL") => {
602                    if let Some(serde_json::Value::String(name)) = map.get("name") {
603                        out.insert(name.clone());
604                    }
605                }
606                // ALIAS { content, value: V, named: true } — the extras
607                // rule renames its content; V is the kind on the schema.
608                Some("ALIAS") => {
609                    let named = map
610                        .get("named")
611                        .and_then(serde_json::Value::as_bool)
612                        .unwrap_or(false);
613                    if named {
614                        if let Some(serde_json::Value::String(value)) = map.get("value") {
615                            out.insert(value.clone());
616                        }
617                    }
618                }
619                // PATTERN / STRING / TOKEN entries describe inter-token
620                // whitespace and have no vertex-side representation.
621                _ => {}
622            }
623        }
624    }
625    Ok(out)
626}
627
628impl Grammar {
629    /// Parse a grammar's `grammar.json` bytes.
630    ///
631    /// Builds the subtyping closure as part of construction so every
632    /// downstream lookup is O(1). The closure is the least relation
633    /// containing `(K, K)` for every rule key `K` and closed under:
634    ///
635    /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
636    ///   reach SYMBOL `K`, then `K ⊑ S`.
637    /// - supertype expansion: if `S` is in the grammar's supertypes
638    ///   block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
639    /// - alias renaming: if a rule body contains
640    ///   `ALIAS { content: SYMBOL R, value: A, named: true }` where
641    ///   `R` reaches kind `K` (or `K = R` when no further hop), then
642    ///   `A ⊑ R` and `K ⊑ A`.
643    ///
644    /// # Errors
645    ///
646    /// Returns [`ParseError::EmitFailed`] when the bytes are not a
647    /// valid `grammar.json` document.
648    pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
649        Self::from_bytes_with_node_types(protocol, bytes, None)
650    }
651
652    /// Parse a grammar from both `grammar.json` and optionally
653    /// `node-types.json` bytes.
654    ///
655    /// # Errors
656    ///
657    /// Returns [`ParseError::EmitFailed`] when `grammar_bytes` is
658    /// not a valid `grammar.json` document.
659    pub fn from_bytes_with_node_types(
660        protocol: &str,
661        grammar_bytes: &[u8],
662        node_types_bytes: Option<&[u8]>,
663    ) -> Result<Self, ParseError> {
664        let mut grammar: Self =
665            serde_json::from_slice(grammar_bytes).map_err(|e| ParseError::EmitFailed {
666                protocol: protocol.to_owned(),
667                reason: format!("grammar.json deserialization failed: {e}"),
668            })?;
669        // The `rules` BTreeMap loses grammar.json's insertion order, but
670        // tree-sitter's START SYMBOL is the FIRST rule as written. Recover
671        // it from the raw bytes so precomputes keyed on the start symbol
672        // (top-level document text) use the right entry point.
673        grammar.start_symbol = extract_start_symbol(grammar_bytes);
674        grammar.subtypes = compute_subtype_closure(&grammar);
675        grammar.named_alias_map = build_named_alias_map(&grammar);
676        grammar.named_alias_sources = build_named_alias_sources(&grammar);
677        grammar.yield_sets = compute_yield_sets(&grammar);
678        if let Some(nt_bytes) = node_types_bytes {
679            let (all_children, field_children, nonfield_children) =
680                build_node_type_children(nt_bytes);
681            grammar.node_type_children = all_children;
682            grammar.node_type_field_children = field_children;
683            grammar.node_type_nonfield_children = nonfield_children;
684            // Repair grammar.json/parser.c FIELD-name drift before any
685            // field-driven precompute or augmentation reads the rule bodies,
686            // so the corrected names flow everywhere downstream.
687            reconcile_field_names(&mut grammar);
688            augment_subtypes_from_node_types(&mut grammar);
689        }
690        grammar.yield_sets = compute_yield_sets(&grammar);
691        grammar.external_alias_map = build_external_alias_map(&grammar);
692        let (token_roles, indent_triggers) = compute_token_roles(&grammar);
693        grammar.token_roles = token_roles;
694        grammar.indent_triggers = indent_triggers;
695        grammar.line_comment_prefixes = extract_line_comment_prefixes(&grammar);
696        let (tb_markers, tb_ws) = classify_trailing_break_markers(&grammar);
697        grammar.trailing_break_markers = tb_markers;
698        grammar.trailing_break_on_whitespace = tb_ws;
699        grammar.top_level_text_admits_newline = classify_top_level_text_admits_newline(&grammar);
700        classify_external_layout_tokens(&mut grammar);
701        classify_external_bracket_delimiters(&mut grammar);
702        classify_external_close_text(&mut grammar);
703        classify_string_content_kinds(&mut grammar);
704        classify_synthetic_indent_rules(&mut grammar);
705        grammar.leading_space_terminals = classify_leading_space_terminals(&grammar);
706        grammar.line_rest_kinds = classify_line_rest_kinds(&grammar);
707        grammar.immediate_token_alias_kinds = classify_immediate_token_alias_kinds(&grammar);
708        grammar.yield_sets = compute_yield_sets(&grammar);
709        Ok(grammar)
710    }
711}
712
713/// Compute the subtyping relation as a forward-indexed map from a
714/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
715pub(crate) fn compute_subtype_closure(
716    grammar: &Grammar,
717) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
718    use std::collections::{HashMap, HashSet};
719    // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
720    // recorded whenever Y is reached by walking the grammar's
721    // ALIAS / hidden-rule / supertype dispatch from a position where
722    // K is the actual vertex kind.
723    let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
724    for name in grammar.rules.keys() {
725        subtypes
726            .entry(name.clone())
727            .or_default()
728            .insert(name.clone());
729    }
730
731    // First pass: collect the immediate "satisfies" edges from each
732    // expandable rule (hidden, supertype) to the kinds reachable by
733    // walking its body, plus alias edges.
734    fn walk<'g>(
735        grammar: &'g Grammar,
736        production: &'g Production,
737        visited: &mut HashSet<&'g str>,
738        out: &mut HashSet<String>,
739    ) {
740        match production {
741            Production::Symbol { name } => {
742                // Direct subtype.
743                out.insert(name.clone());
744                // Continue expansion through hidden / supertype rules
745                // so the closure traverses pass-through dispatch.
746                let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
747                if expand && visited.insert(name.as_str()) {
748                    if let Some(rule) = grammar.rules.get(name) {
749                        walk(grammar, rule, visited, out);
750                    }
751                }
752            }
753            Production::Choice { members } => {
754                for m in members {
755                    walk(grammar, m, visited, out);
756                }
757            }
758            Production::Alias {
759                content,
760                named,
761                value,
762            } => {
763                if *named && !value.is_empty() {
764                    out.insert(value.clone());
765                }
766                walk(grammar, content, visited, out);
767            }
768            Production::Repeat { content }
769            | Production::Repeat1 { content }
770            | Production::Optional { content }
771            | Production::Field { content, .. }
772            | Production::Token { content }
773            | Production::ImmediateToken { content }
774            | Production::Prec { content, .. }
775            | Production::PrecLeft { content, .. }
776            | Production::PrecRight { content, .. }
777            | Production::PrecDynamic { content, .. }
778            | Production::Reserved { content, .. } => {
779                walk(grammar, content, visited, out);
780            }
781            _ => {}
782        }
783    }
784
785    for (name, rule) in &grammar.rules {
786        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
787        if !expand {
788            continue;
789        }
790        let mut visited: HashSet<&str> = HashSet::new();
791        visited.insert(name.as_str());
792        let mut reachable: HashSet<String> = HashSet::new();
793        walk(grammar, rule, &mut visited, &mut reachable);
794        for kind in &reachable {
795            subtypes
796                .entry(kind.clone())
797                .or_default()
798                .insert(name.clone());
799        }
800    }
801
802    // Aliases: scan every rule body for ALIAS { content, value }
803    // declarations. The kinds reachable from `content` satisfy
804    // `value`, AND (by construction) `value` satisfies the
805    // surrounding rule. Walking the ENTIRE grammar once captures
806    // every alias site, irrespective of which rule introduces it.
807    fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
808        match production {
809            Production::Alias {
810                content,
811                named,
812                value,
813            } => {
814                if *named && !value.is_empty() {
815                    out.push((value.clone(), content.as_ref()));
816                }
817                collect_aliases(content, out);
818            }
819            Production::Choice { members } | Production::Seq { members } => {
820                for m in members {
821                    collect_aliases(m, out);
822                }
823            }
824            Production::Repeat { content }
825            | Production::Repeat1 { content }
826            | Production::Optional { content }
827            | Production::Field { content, .. }
828            | Production::Token { content }
829            | Production::ImmediateToken { content }
830            | Production::Prec { content, .. }
831            | Production::PrecLeft { content, .. }
832            | Production::PrecRight { content, .. }
833            | Production::PrecDynamic { content, .. }
834            | Production::Reserved { content, .. } => {
835                collect_aliases(content, out);
836            }
837            _ => {}
838        }
839    }
840    let mut aliases: Vec<(String, &Production)> = Vec::new();
841    for rule in grammar.rules.values() {
842        collect_aliases(rule, &mut aliases);
843    }
844    for (alias_value, content) in aliases {
845        let mut visited: HashSet<&str> = HashSet::new();
846        let mut reachable: HashSet<String> = HashSet::new();
847        walk(grammar, content, &mut visited, &mut reachable);
848        // Aliased value satisfies itself and is satisfied by every
849        // kind its content can reach.
850        subtypes
851            .entry(alias_value.clone())
852            .or_default()
853            .insert(alias_value.clone());
854        for kind in reachable {
855            subtypes
856                .entry(kind)
857                .or_default()
858                .insert(alias_value.clone());
859        }
860    }
861
862    // Transitive close through hidden and supertype rules via Tarjan SCC.
863    //
864    // The relation `K ⊑ Y` means "a vertex of kind K can appear where
865    // the grammar says SYMBOL Y." Transitivity applies when Y is a
866    // hidden or supertype rule (a dispatch point), NOT when Y is a
867    // concrete named rule. We build the directed graph G on dispatchable
868    // node names with edge Y → Z iff Z ∈ subtypes[Y] and Z is dispatchable.
869    // The transitive closure within G is the union of every reachable
870    // dispatchable node, which by Tarjan's theorem is computed in
871    // O(V + E) by contracting SCCs into a DAG, then unioning closures
872    // along reverse topological order.
873    let is_dispatch = |s: &str| s.starts_with('_') || grammar.supertypes.contains(s);
874    // 1. Nodes: every dispatchable name that appears as a key in subtypes
875    //    OR as a member of any subtypes value.
876    let mut nodes: HashSet<String> = HashSet::new();
877    for (k, vs) in &subtypes {
878        if is_dispatch(k) {
879            nodes.insert(k.clone());
880        }
881        for v in vs {
882            if is_dispatch(v) {
883                nodes.insert(v.clone());
884            }
885        }
886    }
887    let nodes: Vec<String> = nodes.into_iter().collect();
888    let index_of: HashMap<&str, usize> = nodes
889        .iter()
890        .enumerate()
891        .map(|(i, n)| (n.as_str(), i))
892        .collect();
893    // 2. Edges: Y → Z iff Z ∈ subtypes[Y] and both are dispatchable.
894    let mut edges: Vec<Vec<usize>> = vec![Vec::new(); nodes.len()];
895    for (i, name) in nodes.iter().enumerate() {
896        if let Some(targets) = subtypes.get(name) {
897            for t in targets {
898                if let Some(&j) = index_of.get(t.as_str()) {
899                    if i != j {
900                        edges[i].push(j);
901                    }
902                }
903            }
904        }
905    }
906
907    // 3. Tarjan SCC. `comp[v]` = SCC index of `v`. SCC indices come out
908    //    in reverse topological order (sinks first), which is exactly
909    //    the order we want for closure accumulation.
910    fn tarjan(edges: &[Vec<usize>]) -> Vec<usize> {
911        let n = edges.len();
912        let mut comp = vec![usize::MAX; n];
913        let mut index_arr = vec![usize::MAX; n];
914        let mut lowlink = vec![0usize; n];
915        let mut on_stack = vec![false; n];
916        let mut stack: Vec<usize> = Vec::new();
917        let mut next_index = 0usize;
918        let mut next_comp = 0usize;
919        // Iterative Tarjan to avoid stack overflow on large grammars.
920        let mut work: Vec<(usize, usize)> = Vec::new();
921        for start in 0..n {
922            if index_arr[start] != usize::MAX {
923                continue;
924            }
925            work.push((start, 0));
926            index_arr[start] = next_index;
927            lowlink[start] = next_index;
928            next_index += 1;
929            stack.push(start);
930            on_stack[start] = true;
931            while let Some(&(v, i)) = work.last() {
932                if i < edges[v].len() {
933                    let w = edges[v][i];
934                    if let Some(slot) = work.last_mut() {
935                        slot.1 += 1;
936                    }
937                    if index_arr[w] == usize::MAX {
938                        index_arr[w] = next_index;
939                        lowlink[w] = next_index;
940                        next_index += 1;
941                        stack.push(w);
942                        on_stack[w] = true;
943                        work.push((w, 0));
944                    } else if on_stack[w] && index_arr[w] < lowlink[v] {
945                        lowlink[v] = index_arr[w];
946                    }
947                } else {
948                    if lowlink[v] == index_arr[v] {
949                        while let Some(w) = stack.pop() {
950                            on_stack[w] = false;
951                            comp[w] = next_comp;
952                            if w == v {
953                                break;
954                            }
955                        }
956                        next_comp += 1;
957                    }
958                    let lv = lowlink[v];
959                    work.pop();
960                    if let Some(&(parent, _)) = work.last() {
961                        if lv < lowlink[parent] {
962                            lowlink[parent] = lv;
963                        }
964                    }
965                }
966            }
967        }
968        comp
969    }
970    let comp = tarjan(&edges);
971    let num_comps = comp.iter().max().copied().map_or(0, |m| m + 1);
972
973    // 4. For each SCC, accumulate the set of dispatchable nodes reachable
974    //    from it. SCCs are emitted in reverse topological order, so when
975    //    we process SCC c, every successor SCC has its closure already
976    //    computed.
977    let mut scc_members: Vec<Vec<usize>> = vec![Vec::new(); num_comps];
978    for (v, &c) in comp.iter().enumerate() {
979        scc_members[c].push(v);
980    }
981    let mut scc_closure: Vec<HashSet<String>> = vec![HashSet::new(); num_comps];
982    for c in 0..num_comps {
983        // Members of the SCC are mutually reachable.
984        let mut closure: HashSet<String> = HashSet::new();
985        for &v in &scc_members[c] {
986            closure.insert(nodes[v].clone());
987        }
988        // Successor SCCs' closures (already computed).
989        for &v in &scc_members[c] {
990            for &w in &edges[v] {
991                let wc = comp[w];
992                if wc != c {
993                    closure.extend(scc_closure[wc].iter().cloned());
994                }
995            }
996        }
997        scc_closure[c] = closure;
998    }
999
1000    // 5. Apply: for each kind K in `subtypes`, replace its dispatchable
1001    //    supertypes by their full closure. Non-dispatchable members
1002    //    (concrete kinds) stay as-is.
1003    let keys: Vec<String> = subtypes.keys().cloned().collect();
1004    for k in keys {
1005        let existing = subtypes.remove(&k).unwrap_or_default();
1006        let mut new_set: HashSet<String> = HashSet::new();
1007        for s in &existing {
1008            new_set.insert(s.clone());
1009            if let Some(&i) = index_of.get(s.as_str()) {
1010                new_set.extend(scc_closure[comp[i]].iter().cloned());
1011            }
1012        }
1013        subtypes.insert(k, new_set);
1014    }
1015
1016    subtypes
1017}
1018
1019/// Compute the Yield set for every rule in the grammar.
1020///
1021/// `Yield(P)` is the set of concrete vertex kinds that can appear as
1022/// the first named child when production P is taken. See the
1023/// `Grammar::yield_sets` doc comment for the inductive definition.
1024pub(crate) fn compute_yield_sets(
1025    grammar: &Grammar,
1026) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
1027    let mut cache: std::collections::HashMap<String, std::collections::HashSet<String>> =
1028        std::collections::HashMap::new();
1029    for (name, rule) in &grammar.rules {
1030        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
1031        if !expand {
1032            continue;
1033        }
1034        if cache.contains_key(name) {
1035            continue;
1036        }
1037        let mut visited = std::collections::HashSet::new();
1038        let ys = yield_of_production(grammar, rule, &mut visited, &mut cache);
1039        cache.insert(name.clone(), ys);
1040    }
1041    cache
1042}
1043
1044/// Compute the Yield set of an arbitrary production node.
1045///
1046/// Uses `cache` (the partially-built `yield_sets` map) as
1047/// memoization. `visited` tracks the current recursion path to
1048/// detect cycles through hidden/supertype rules; a cycle returns ∅
1049/// (a cycle that never passes through a concrete named symbol
1050/// cannot produce a first child).
1051pub(crate) fn yield_of_production(
1052    grammar: &Grammar,
1053    production: &Production,
1054    visited: &mut std::collections::HashSet<String>,
1055    cache: &mut std::collections::HashMap<String, std::collections::HashSet<String>>,
1056) -> std::collections::HashSet<String> {
1057    match production {
1058        Production::Symbol { name } => {
1059            let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
1060            if !expand {
1061                let mut set = std::collections::HashSet::new();
1062                set.insert(name.clone());
1063                return set;
1064            }
1065            if let Some(cached) = cache.get(name) {
1066                return cached.clone();
1067            }
1068            {
1069                if !visited.insert(name.clone()) {
1070                    return std::collections::HashSet::new();
1071                }
1072                let result = if let Some(rule) = grammar.rules.get(name) {
1073                    yield_of_production(grammar, rule, visited, cache)
1074                } else {
1075                    std::collections::HashSet::new()
1076                };
1077                visited.remove(name);
1078                cache.insert(name.clone(), result.clone());
1079                result
1080            }
1081        }
1082        Production::Alias {
1083            content,
1084            named,
1085            value,
1086        } => {
1087            if *named && !value.is_empty() {
1088                let mut set = std::collections::HashSet::new();
1089                set.insert(value.clone());
1090                set
1091            } else {
1092                yield_of_production(grammar, content, visited, cache)
1093            }
1094        }
1095        Production::Seq { members } => {
1096            if members.is_empty() {
1097                let mut set = std::collections::HashSet::new();
1098                set.insert(String::new());
1099                set
1100            } else {
1101                // Walk SEQ members left-to-right. STRING/PATTERN yield ∅
1102                // (anonymous tokens, skipped). Named-child-producing
1103                // members yield a non-empty set. If that set contains ε,
1104                // the member is optional and the next member's yield is
1105                // also reachable. Accumulate until we hit a non-optional
1106                // named-child producer.
1107                let mut combined = std::collections::HashSet::new();
1108                for m in members {
1109                    let ys = yield_of_production(grammar, m, visited, cache);
1110                    if ys.is_empty() {
1111                        continue;
1112                    }
1113                    let has_epsilon = ys.contains("");
1114                    combined.extend(ys);
1115                    if !has_epsilon {
1116                        break;
1117                    }
1118                }
1119                combined
1120            }
1121        }
1122        Production::Choice { members } => {
1123            let mut union = std::collections::HashSet::new();
1124            for m in members {
1125                union.extend(yield_of_production(grammar, m, visited, cache));
1126            }
1127            union
1128        }
1129        Production::Optional { content } => {
1130            let mut set = yield_of_production(grammar, content, visited, cache);
1131            set.insert(String::new());
1132            set
1133        }
1134        Production::Blank => {
1135            let mut set = std::collections::HashSet::new();
1136            set.insert(String::new());
1137            set
1138        }
1139        Production::String { .. } | Production::Pattern { .. } => std::collections::HashSet::new(),
1140        Production::Repeat { content } => {
1141            let mut set = yield_of_production(grammar, content, visited, cache);
1142            set.insert(String::new());
1143            set
1144        }
1145        Production::Repeat1 { content }
1146        | Production::Field { content, .. }
1147        | Production::Token { content }
1148        | Production::ImmediateToken { content }
1149        | Production::Prec { content, .. }
1150        | Production::PrecLeft { content, .. }
1151        | Production::PrecRight { content, .. }
1152        | Production::PrecDynamic { content, .. }
1153        | Production::Reserved { content, .. } => {
1154            yield_of_production(grammar, content, visited, cache)
1155        }
1156    }
1157}
1158
1159// ═══════════════════════════════════════════════════════════════════
1160// node-types.json integration
1161// ═══════════════════════════════════════════════════════════════════
1162
1163/// Parse node-types.json and build a map from parent kind to the set
1164/// of all named child kinds the parser can produce for that parent.
1165pub(crate) type NodeTypeResult = (
1166    std::collections::HashMap<String, std::collections::HashSet<String>>,
1167    std::collections::HashMap<
1168        String,
1169        std::collections::HashMap<String, std::collections::HashSet<String>>,
1170    >,
1171    std::collections::HashMap<String, std::collections::HashSet<String>>,
1172);
1173
1174pub(crate) fn build_node_type_children(nt_bytes: &[u8]) -> NodeTypeResult {
1175    use std::collections::{HashMap, HashSet};
1176    // Use the resilient parser, not a direct `from_slice`: recent
1177    // tree-sitter releases append non-node metadata markers (e.g. erlang's
1178    // `{"@generated": true}`) with no `type` field, which a strict
1179    // `Vec<NodeType>` deserialization rejects — silently zeroing the whole
1180    // node-types child map (and with it FIELD-name reconciliation and the
1181    // subtype augmentation it feeds).
1182    let node_types: Vec<crate::theory_extract::NodeType> =
1183        match crate::theory_extract::parse_node_types(nt_bytes) {
1184            Ok(v) => v,
1185            Err(_) => return (HashMap::new(), HashMap::new(), HashMap::new()),
1186        };
1187    let mut all_map: HashMap<String, HashSet<String>> = HashMap::new();
1188    let mut field_map: HashMap<String, HashMap<String, HashSet<String>>> = HashMap::new();
1189    let mut nonfield_map: HashMap<String, HashSet<String>> = HashMap::new();
1190    for entry in &node_types {
1191        if !entry.named {
1192            continue;
1193        }
1194        let mut child_kinds = HashSet::new();
1195        for (field_name, field_value) in &entry.fields {
1196            if let Some(types) = field_value.get("types").and_then(|t| t.as_array()) {
1197                for t in types {
1198                    if let (Some(name), Some(true)) = (
1199                        t.get("type").and_then(|n| n.as_str()),
1200                        t.get("named").and_then(serde_json::Value::as_bool),
1201                    ) {
1202                        child_kinds.insert(name.to_owned());
1203                        field_map
1204                            .entry(entry.node_type.clone())
1205                            .or_default()
1206                            .entry(field_name.clone())
1207                            .or_default()
1208                            .insert(name.to_owned());
1209                    }
1210                }
1211            }
1212        }
1213        if let Some(ref children) = entry.children {
1214            for t in &children.types {
1215                if t.named {
1216                    child_kinds.insert(t.node_type.clone());
1217                    nonfield_map
1218                        .entry(entry.node_type.clone())
1219                        .or_default()
1220                        .insert(t.node_type.clone());
1221                }
1222            }
1223        }
1224        if !child_kinds.is_empty() {
1225            all_map.insert(entry.node_type.clone(), child_kinds);
1226        }
1227    }
1228    (all_map, field_map, nonfield_map)
1229}
1230
1231/// Augment `grammar.subtypes` with child-kind data from node-types.json.
1232///
1233/// Uses per-field structural matching: for each parent kind P, each field
1234/// F in P's node-types.json entry, and each child kind C in field F's
1235/// types, find the SYMBOL S referenced at field F's position in P's
1236/// grammar rule. If C lacks a grammar rule and does not already satisfy S,
1237/// record C ⊑ S. Non-field children are matched against non-FIELD symbols
1238/// in the rule body.
1239pub(crate) fn augment_subtypes_from_node_types(grammar: &mut Grammar) {
1240    use std::collections::HashMap;
1241
1242    // Build per-field child-kind map from node-types.json by re-parsing.
1243    let mut pairs: Vec<(String, String)> = Vec::new();
1244    for parent_kind in grammar.node_type_children.keys() {
1245        let Some(rule) = grammar.rules.get(parent_kind) else {
1246            continue;
1247        };
1248
1249        // Collect symbols from the grammar rule, partitioned by the
1250        // FIELD they appear in (or non-field for top-level symbols).
1251        let mut field_symbols: HashMap<String, Vec<String>> = HashMap::new();
1252        let mut non_field_symbols: Vec<String> = Vec::new();
1253        collect_field_symbols(rule, &mut field_symbols, &mut non_field_symbols, false);
1254
1255        // A node-types child kind `K` satisfies a grammar symbol `S` only
1256        // when `S` is a legitimate *dispatch target* that can transparently
1257        // yield `K` (a hidden `_`-rule, a declared supertype, or a rule with
1258        // no literal tokens of its own that inlines under another kind). A
1259        // *concrete self-anchored* rule (own literal tokens / brackets /
1260        // keywords, e.g. cpp `attribute_declaration = SEQ["[[", …, "]]"]`)
1261        // always materialises under its OWN name, so it can never be the
1262        // surface kind of a *different* child. The node-types `children.types`
1263        // list is a flat sibling set: a rule with several distinct non-field
1264        // children (cpp `base_class_clause` = access_specifier + type_identifier
1265        // + attribute_declaration) would otherwise cross-product every child
1266        // kind with every symbol, spuriously making `type_identifier ⊑
1267        // attribute_declaration` and letting a leading REPEAT(attribute_decl)
1268        // steal the `type_identifier` (base-class reorder). Mirror `sat`'s
1269        // non-transitivity: skip self-anchored concrete targets.
1270        let dispatch_target = |grammar: &Grammar, sym: &str| -> bool {
1271            sym.starts_with('_')
1272                || grammar.supertypes.contains(sym)
1273                || grammar
1274                    .rules
1275                    .get(sym)
1276                    .is_none_or(|r| literal_strings(r).is_empty())
1277        };
1278
1279        // Per-field augmentation: for each FIELD F in the grammar rule,
1280        // match child kinds that node-types.json says appear in field F
1281        // against the symbols at field F's position.
1282        if let Some(nt_fields) = grammar.node_type_field_children.get(parent_kind) {
1283            for (field_name, nt_child_kinds) in nt_fields {
1284                let Some(rule_syms) = field_symbols.get(field_name) else {
1285                    continue;
1286                };
1287                for child_kind in nt_child_kinds {
1288                    if grammar.rules.contains_key(child_kind) {
1289                        continue;
1290                    }
1291                    for sym_name in rule_syms {
1292                        if dispatch_target(grammar, sym_name)
1293                            && !kind_satisfies_symbol(grammar, Some(child_kind), sym_name)
1294                        {
1295                            pairs.push((child_kind.clone(), sym_name.clone()));
1296                        }
1297                    }
1298                }
1299            }
1300        }
1301
1302        // Non-field augmentation: for child kinds from `children.types`
1303        // (no field), match against non-FIELD symbols in the rule.
1304        if let Some(nt_nonfield) = grammar.node_type_nonfield_children.get(parent_kind) {
1305            for child_kind in nt_nonfield {
1306                if grammar.rules.contains_key(child_kind) {
1307                    continue;
1308                }
1309                for sym_name in &non_field_symbols {
1310                    if dispatch_target(grammar, sym_name)
1311                        && !kind_satisfies_symbol(grammar, Some(child_kind), sym_name)
1312                    {
1313                        pairs.push((child_kind.clone(), sym_name.clone()));
1314                    }
1315                }
1316            }
1317        }
1318    }
1319    for (child_kind, sym_name) in pairs {
1320        grammar
1321            .subtypes
1322            .entry(child_kind)
1323            .or_default()
1324            .insert(sym_name);
1325    }
1326}
1327
1328/// Walk a production and collect referenced symbols, separating those
1329/// inside FIELD bodies (keyed by field name) from those outside any FIELD.
1330/// Reconcile grammar.json `FIELD` names against node-types.json when they
1331/// desync. The vendored `grammar.json` and the generated `parser.c`
1332/// (reflected by `node-types.json`) can drift: a `FIELD(exprs, _expr)` in
1333/// `grammar.json` may correspond to a field the parser actually emits as
1334/// `expr` (erlang `list_comprehension`'s template). The schema's child
1335/// edges carry the *parser's* field name, so the emit walker's
1336/// `take_field("exprs")` finds nothing and the child is dropped.
1337///
1338/// `node-types.json` is authoritative for the parser's field names. For
1339/// each rule whose kind appears there, when there is exactly ONE grammar
1340/// `FIELD` name absent from the node-types field set and exactly ONE
1341/// node-types field name absent from the grammar's field set, the grammar
1342/// name is the stale one: rewrite every `FIELD` carrying it to the
1343/// node-types name. The 1-to-1 constraint keeps this to the unambiguous
1344/// rename case (a genuine drift), never a structural reinterpretation.
1345pub(crate) fn reconcile_field_names(grammar: &mut Grammar) {
1346    use std::collections::HashSet;
1347    let mut renames: Vec<(String, String, String)> = Vec::new();
1348    for (kind, nt_fields) in &grammar.node_type_field_children {
1349        let Some(rule) = grammar.rules.get(kind) else {
1350            continue;
1351        };
1352        let mut grammar_fields: HashSet<String> = HashSet::new();
1353        collect_grammar_field_names(rule, &mut grammar_fields);
1354        let nt_names: HashSet<&String> = nt_fields.keys().collect();
1355        let grammar_only: Vec<&String> = grammar_fields
1356            .iter()
1357            .filter(|f| !nt_names.contains(f))
1358            .collect();
1359        let nt_only: Vec<&String> = nt_fields
1360            .keys()
1361            .filter(|f| !grammar_fields.contains(*f))
1362            .collect();
1363        // Exactly one stale grammar field and one unmatched parser field:
1364        // the unambiguous drift. (Zero-diff rules and many-to-many
1365        // reshuffles are left untouched.)
1366        if grammar_only.len() == 1 && nt_only.len() == 1 {
1367            renames.push((kind.clone(), grammar_only[0].clone(), nt_only[0].clone()));
1368        }
1369    }
1370    for (kind, from, to) in renames {
1371        if let Some(rule) = grammar.rules.get_mut(&kind) {
1372            rename_field_in(rule, &from, &to);
1373        }
1374    }
1375}
1376
1377/// Collect the set of `FIELD` names appearing anywhere in `prod`.
1378fn collect_grammar_field_names(prod: &Production, out: &mut std::collections::HashSet<String>) {
1379    match prod {
1380        Production::Field { name, content } => {
1381            out.insert(name.clone());
1382            collect_grammar_field_names(content, out);
1383        }
1384        Production::Choice { members } | Production::Seq { members } => {
1385            for m in members {
1386                collect_grammar_field_names(m, out);
1387            }
1388        }
1389        Production::Repeat { content }
1390        | Production::Repeat1 { content }
1391        | Production::Optional { content }
1392        | Production::Alias { content, .. }
1393        | Production::Token { content }
1394        | Production::ImmediateToken { content }
1395        | Production::Prec { content, .. }
1396        | Production::PrecLeft { content, .. }
1397        | Production::PrecRight { content, .. }
1398        | Production::PrecDynamic { content, .. }
1399        | Production::Reserved { content, .. } => collect_grammar_field_names(content, out),
1400        _ => {}
1401    }
1402}
1403
1404/// Rewrite every `FIELD(from, …)` to `FIELD(to, …)` within `prod`.
1405fn rename_field_in(prod: &mut Production, from: &str, to: &str) {
1406    match prod {
1407        Production::Field { name, content } => {
1408            if name == from {
1409                to.clone_into(name);
1410            }
1411            rename_field_in(content, from, to);
1412        }
1413        Production::Choice { members } | Production::Seq { members } => {
1414            for m in members {
1415                rename_field_in(m, from, to);
1416            }
1417        }
1418        Production::Repeat { content }
1419        | Production::Repeat1 { content }
1420        | Production::Optional { content }
1421        | Production::Alias { content, .. }
1422        | Production::Token { content }
1423        | Production::ImmediateToken { content }
1424        | Production::Prec { content, .. }
1425        | Production::PrecLeft { content, .. }
1426        | Production::PrecRight { content, .. }
1427        | Production::PrecDynamic { content, .. }
1428        | Production::Reserved { content, .. } => rename_field_in(content, from, to),
1429        _ => {}
1430    }
1431}
1432
1433pub(crate) fn collect_field_symbols(
1434    prod: &Production,
1435    field_map: &mut std::collections::HashMap<String, Vec<String>>,
1436    non_field: &mut Vec<String>,
1437    inside_field: bool,
1438) {
1439    match prod {
1440        Production::Symbol { name } if !inside_field => {
1441            non_field.push(name.clone());
1442        }
1443        Production::Field { name, content } => {
1444            let mut syms = Vec::new();
1445            collect_symbols_flat(content, &mut syms);
1446            field_map.entry(name.clone()).or_default().extend(syms);
1447        }
1448        Production::Choice { members } | Production::Seq { members } => {
1449            for m in members {
1450                collect_field_symbols(m, field_map, non_field, inside_field);
1451            }
1452        }
1453        Production::Repeat { content }
1454        | Production::Repeat1 { content }
1455        | Production::Optional { content }
1456        | Production::Alias { content, .. }
1457        | Production::Token { content }
1458        | Production::ImmediateToken { content }
1459        | Production::Prec { content, .. }
1460        | Production::PrecLeft { content, .. }
1461        | Production::PrecRight { content, .. }
1462        | Production::PrecDynamic { content, .. }
1463        | Production::Reserved { content, .. } => {
1464            collect_field_symbols(content, field_map, non_field, inside_field);
1465        }
1466        _ => {}
1467    }
1468}
1469
1470pub(crate) fn collect_symbols_flat(prod: &Production, out: &mut Vec<String>) {
1471    match prod {
1472        Production::Symbol { name } => out.push(name.clone()),
1473        Production::Choice { members } | Production::Seq { members } => {
1474            for m in members {
1475                collect_symbols_flat(m, out);
1476            }
1477        }
1478        Production::Repeat { content }
1479        | Production::Repeat1 { content }
1480        | Production::Optional { content }
1481        | Production::Alias { content, .. }
1482        | Production::Field { content, .. }
1483        | Production::Token { content }
1484        | Production::ImmediateToken { content }
1485        | Production::Prec { content, .. }
1486        | Production::PrecLeft { content, .. }
1487        | Production::PrecRight { content, .. }
1488        | Production::PrecDynamic { content, .. }
1489        | Production::Reserved { content, .. } => collect_symbols_flat(content, out),
1490        _ => {}
1491    }
1492}
1493
1494/// Build a map from external scanner symbol names to their anonymous
1495/// ALIAS values by walking every rule body in the grammar.
1496pub(crate) fn build_external_alias_map(
1497    grammar: &Grammar,
1498) -> std::collections::HashMap<String, String> {
1499    let mut map = std::collections::HashMap::new();
1500    fn walk(
1501        grammar: &Grammar,
1502        prod: &Production,
1503        map: &mut std::collections::HashMap<String, String>,
1504    ) {
1505        match prod {
1506            Production::Alias {
1507                content,
1508                named,
1509                value,
1510            } => {
1511                if !*named && !value.is_empty() {
1512                    if let Production::Symbol { name } = content.as_ref() {
1513                        if name.starts_with('_') && !grammar.rules.contains_key(name) {
1514                            map.entry(name.clone()).or_insert_with(|| value.clone());
1515                        }
1516                    }
1517                }
1518                walk(grammar, content, map);
1519            }
1520            Production::Choice { members } | Production::Seq { members } => {
1521                for m in members {
1522                    walk(grammar, m, map);
1523                }
1524            }
1525            Production::Repeat { content }
1526            | Production::Repeat1 { content }
1527            | Production::Optional { content }
1528            | Production::Field { content, .. }
1529            | Production::Token { content }
1530            | Production::ImmediateToken { content }
1531            | Production::Prec { content, .. }
1532            | Production::PrecLeft { content, .. }
1533            | Production::PrecRight { content, .. }
1534            | Production::PrecDynamic { content, .. }
1535            | Production::Reserved { content, .. } => walk(grammar, content, map),
1536            _ => {}
1537        }
1538    }
1539    for rule in grammar.rules.values() {
1540        walk(grammar, rule, &mut map);
1541    }
1542    map
1543}
1544
1545/// Build a map from named-alias values to their source symbol names.
1546/// When tree-sitter emits a vertex with kind `V` via
1547/// `alias($.source, $.V)`, the grammar has no rule keyed by `V`.
1548/// This map lets the emitter resolve `V → source` and walk the source
1549/// rule with proper token roles and bracket pairs.
1550pub(crate) fn build_named_alias_map(
1551    grammar: &Grammar,
1552) -> std::collections::HashMap<String, String> {
1553    let mut map = std::collections::HashMap::new();
1554    fn walk(prod: &Production, map: &mut std::collections::HashMap<String, String>) {
1555        match prod {
1556            Production::Alias {
1557                content,
1558                named,
1559                value,
1560            } => {
1561                if *named && !value.is_empty() {
1562                    if let Production::Symbol { name } = content.as_ref() {
1563                        map.entry(value.clone()).or_insert_with(|| name.clone());
1564                    }
1565                }
1566                walk(content, map);
1567            }
1568            Production::Choice { members } | Production::Seq { members } => {
1569                for m in members {
1570                    walk(m, map);
1571                }
1572            }
1573            Production::Repeat { content }
1574            | Production::Repeat1 { content }
1575            | Production::Optional { content }
1576            | Production::Field { content, .. }
1577            | Production::Token { content }
1578            | Production::ImmediateToken { content }
1579            | Production::Prec { content, .. }
1580            | Production::PrecLeft { content, .. }
1581            | Production::PrecRight { content, .. }
1582            | Production::PrecDynamic { content, .. }
1583            | Production::Reserved { content, .. } => walk(content, map),
1584            _ => {}
1585        }
1586    }
1587    for rule in grammar.rules.values() {
1588        walk(rule, &mut map);
1589    }
1590    map
1591}
1592
1593/// Every source rule that aliases to each kind, deduplicated, in
1594/// first-seen order. Unlike [`build_named_alias_map`] (which keeps only
1595/// the first source per kind) this records the full set, so the emitter
1596/// can choose, among several rules that collapse to the same surface
1597/// kind, the one whose production admits a given vertex's children.
1598pub(crate) fn build_named_alias_sources(
1599    grammar: &Grammar,
1600) -> std::collections::HashMap<String, Vec<String>> {
1601    let mut map: std::collections::HashMap<String, Vec<String>> = std::collections::HashMap::new();
1602    fn walk(prod: &Production, map: &mut std::collections::HashMap<String, Vec<String>>) {
1603        match prod {
1604            Production::Alias {
1605                content,
1606                named,
1607                value,
1608            } => {
1609                if *named && !value.is_empty() {
1610                    if let Production::Symbol { name } = content.as_ref() {
1611                        let srcs = map.entry(value.clone()).or_default();
1612                        if !srcs.contains(name) {
1613                            srcs.push(name.clone());
1614                        }
1615                    }
1616                }
1617                walk(content, map);
1618            }
1619            Production::Choice { members } | Production::Seq { members } => {
1620                for m in members {
1621                    walk(m, map);
1622                }
1623            }
1624            Production::Repeat { content }
1625            | Production::Repeat1 { content }
1626            | Production::Optional { content }
1627            | Production::Field { content, .. }
1628            | Production::Token { content }
1629            | Production::ImmediateToken { content }
1630            | Production::Prec { content, .. }
1631            | Production::PrecLeft { content, .. }
1632            | Production::PrecRight { content, .. }
1633            | Production::PrecDynamic { content, .. }
1634            | Production::Reserved { content, .. } => walk(content, map),
1635            _ => {}
1636        }
1637    }
1638    for rule in grammar.rules.values() {
1639        walk(rule, &mut map);
1640    }
1641    map
1642}
1643
1644/// Compute token roles for every STRING value in every grammar rule.
1645///
1646/// For each rule R, analyzes the production body to classify every
1647/// STRING token by its structural role (bracket-open, bracket-close,
1648/// separator, keyword, operator). Also identifies which bracket-open
1649/// tokens trigger indentation (those with REPEAT/REPEAT1 between
1650/// the open and close).
1651///
1652/// Bracket pairs are detected per-SEQ, not from a fixed character
1653/// set. Two STRINGs are a matched pair iff they are the first and
1654/// last STRING-typed members of the same SEQ with at least one
1655/// non-STRING member between them and open != close.
1656pub(crate) type RoleMap =
1657    std::collections::HashMap<String, std::collections::HashMap<String, TokenRole>>;
1658
1659pub(crate) type IndentSet = std::collections::HashSet<(String, String)>;
1660
1661pub(crate) fn compute_token_roles(grammar: &Grammar) -> (RoleMap, IndentSet) {
1662    use std::collections::{HashMap, HashSet};
1663    let mut all_roles: HashMap<String, HashMap<String, TokenRole>> = HashMap::new();
1664    let mut indent_triggers: HashSet<(String, String)> = HashSet::new();
1665
1666    for (rule_name, rule) in &grammar.rules {
1667        let mut roles: HashMap<String, TokenRole> = HashMap::new();
1668        classify_production(rule, &mut roles, &mut indent_triggers, rule_name);
1669        if !roles.is_empty() {
1670            all_roles.insert(rule_name.clone(), roles);
1671        }
1672    }
1673
1674    (all_roles, indent_triggers)
1675}
1676
1677/// Recursively classify STRING tokens in a production body.
1678pub(crate) fn classify_production(
1679    prod: &Production,
1680    roles: &mut std::collections::HashMap<String, TokenRole>,
1681    indent_triggers: &mut std::collections::HashSet<(String, String)>,
1682    rule_name: &str,
1683) {
1684    match prod {
1685        Production::Seq { members } => {
1686            classify_seq(members, roles, indent_triggers, rule_name, false);
1687        }
1688        Production::Choice { members } => {
1689            for m in members {
1690                // CHOICE alternatives' SEQs get in_choice=true so that
1691                // position-0 STRINGs are classified as Operators (not
1692                // prefix sigils). E.g. `=` in `CHOICE [SEQ ["=", ...]]`
1693                // is an operator, not a prefix.
1694                match m {
1695                    Production::Seq {
1696                        members: seq_members,
1697                    } => {
1698                        classify_seq(seq_members, roles, indent_triggers, rule_name, true);
1699                    }
1700                    _ => classify_production(m, roles, indent_triggers, rule_name),
1701                }
1702            }
1703        }
1704        Production::Repeat { content } | Production::Repeat1 { content } => {
1705            classify_repeat_body(content, roles, indent_triggers, rule_name);
1706        }
1707        Production::Optional { content }
1708        | Production::Field { content, .. }
1709        | Production::Token { content }
1710        | Production::ImmediateToken { content }
1711        | Production::Prec { content, .. }
1712        | Production::PrecLeft { content, .. }
1713        | Production::PrecRight { content, .. }
1714        | Production::PrecDynamic { content, .. }
1715        | Production::Reserved { content, .. } => {
1716            classify_production(content, roles, indent_triggers, rule_name);
1717        }
1718        Production::Alias { content, .. } => {
1719            classify_production(content, roles, indent_triggers, rule_name);
1720        }
1721        _ => {}
1722    }
1723}
1724
1725/// Classify STRING tokens within a SEQ. This is where bracket pairs
1726/// are detected and roles assigned.
1727pub(crate) fn classify_seq(
1728    members: &[Production],
1729    roles: &mut std::collections::HashMap<String, TokenRole>,
1730    indent_triggers: &mut std::collections::HashSet<(String, String)>,
1731    rule_name: &str,
1732    in_choice: bool,
1733) {
1734    let string_positions: Vec<(usize, &str)> = members
1735        .iter()
1736        .enumerate()
1737        .filter_map(|(i, m)| unwrap_to_string(m).map(|s| (i, s)))
1738        .collect();
1739
1740    let content_count = members
1741        .iter()
1742        .filter(|m| unwrap_to_string(m).is_none())
1743        .count();
1744
1745    if string_positions.len() >= 2 {
1746        let (first_idx, first_val) = string_positions[0];
1747        let (last_idx, last_val) = string_positions[string_positions.len() - 1];
1748
1749        let has_content_between = members[first_idx + 1..last_idx]
1750            .iter()
1751            .any(|m| unwrap_to_string(m).is_none());
1752
1753        let both_punct = !is_word_like(first_val) && !is_word_like(last_val);
1754        let both_word = is_word_like(first_val) && is_word_like(last_val);
1755        if has_content_between && first_val != last_val && (both_punct || both_word) {
1756            roles.insert(first_val.to_owned(), TokenRole::BracketOpen);
1757            roles.insert(last_val.to_owned(), TokenRole::BracketClose);
1758
1759            let between = &members[first_idx + 1..last_idx];
1760            if first_val == "{" && has_repeat_recursive(between) {
1761                indent_triggers.insert((rule_name.to_owned(), first_val.to_owned()));
1762            }
1763        }
1764    }
1765
1766    // An optional leading unary sign (`CHOICE[- | BLANK]` / `OPTIONAL(-)`
1767    // at the head of the SEQ, with an operand after it) is a tight prefix
1768    // on that operand: `signed_number = SEQ[CHOICE[- | BLANK], number]`
1769    // emits `-1.0`, not `- 1.0`. The sign lives inside a CHOICE, so the
1770    // per-position pass below never sees it; classify it here.
1771    if members.len() >= 2 {
1772        if let Some(first) = members.first() {
1773            let has_following_content = members[1..].iter().any(|m| unwrap_to_string(m).is_none());
1774            if has_following_content {
1775                for sign in leading_optional_sign(first) {
1776                    roles.entry(sign).or_insert(TokenRole::BracketOpen);
1777                }
1778            }
1779        }
1780    }
1781
1782    // Classify remaining STRINGs by structural position.
1783    let first_content_idx = members.iter().position(|m| unwrap_to_string(m).is_none());
1784    let last_content_idx = members.iter().rposition(|m| unwrap_to_string(m).is_none());
1785
1786    for (i, m) in members.iter().enumerate() {
1787        if let Some(value) = unwrap_to_string(m) {
1788            let value = value.to_owned();
1789            if !roles.contains_key(&value) {
1790                if is_word_like(&value) {
1791                    roles.insert(value.clone(), TokenRole::Keyword);
1792                } else if !in_choice
1793                    && first_content_idx.is_some_and(|fc| i < fc)
1794                    && is_prefix_sigil(&value)
1795                {
1796                    roles.insert(value.clone(), TokenRole::BracketOpen);
1797                } else if last_content_idx.is_some_and(|lc| i > lc) {
1798                    // STRING after all content: suffix (tight before).
1799                    // Unlike prefix, this applies in CHOICE branches too
1800                    // (e.g. `()` in bash function_definition's CHOICE).
1801                    roles.insert(value.clone(), TokenRole::BracketClose);
1802                } else if !in_choice
1803                    && string_positions.len() == 1
1804                    && content_count == 2
1805                    && value.len() == 1
1806                {
1807                    // Single-character STRING between exactly two content
1808                    // members in a non-CHOICE SEQ: this is a connector
1809                    // (e.g. `.` in `SEQ [object, ".", attr]`).
1810                    // Multi-character tokens like `:=`, `<-`, `->` are
1811                    // operators (spaced), not connectors.
1812                    roles.insert(value.clone(), TokenRole::Connector);
1813                } else {
1814                    roles.insert(value.clone(), TokenRole::Operator);
1815                }
1816            }
1817        }
1818    }
1819
1820    for m in members {
1821        if unwrap_to_string(m).is_none() {
1822            classify_production(m, roles, indent_triggers, rule_name);
1823        }
1824    }
1825}
1826
1827/// Classify STRING tokens in a REPEAT body. The first STRING in a
1828/// REPEAT body's inner SEQ is a separator (e.g. `,` in
1829/// `REPEAT(SEQ [",", item])`).
1830pub(crate) fn classify_repeat_body(
1831    content: &Production,
1832    roles: &mut std::collections::HashMap<String, TokenRole>,
1833    indent_triggers: &mut std::collections::HashSet<(String, String)>,
1834    rule_name: &str,
1835) {
1836    match content {
1837        Production::Seq { members } => {
1838            if let Some(Production::String { value }) = members.first() {
1839                roles.insert(value.clone(), TokenRole::Separator);
1840            }
1841            classify_seq(members, roles, indent_triggers, rule_name, false);
1842        }
1843        _ => classify_production(content, roles, indent_triggers, rule_name),
1844    }
1845}
1846
1847/// Classify STRING tokens within a SEQ by structural position, returning
1848/// a role for each member position. Non-STRING positions get `None`.
1849/// This is the inline variant of `classify_seq` used at emission time
1850/// to avoid the flat per-rule map's conflation of same-text tokens.
1851pub(crate) fn classify_seq_positions(
1852    members: &[Production],
1853    in_choice: bool,
1854) -> Vec<Option<TokenRole>> {
1855    let mut roles: Vec<Option<TokenRole>> = vec![None; members.len()];
1856
1857    let string_positions: Vec<(usize, &str)> = members
1858        .iter()
1859        .enumerate()
1860        .filter_map(|(i, m)| unwrap_to_string(m).map(|s| (i, s)))
1861        .collect();
1862
1863    let content_count = members
1864        .iter()
1865        .filter(|m| unwrap_to_string(m).is_none())
1866        .count();
1867
1868    // Bracket pair detection.
1869    let mut bracket_open_idx: Option<usize> = None;
1870    let mut bracket_close_idx: Option<usize> = None;
1871
1872    // Canonical pairing first: pair an actual `(`/`[`/`{` with its matching
1873    // closer, even when other STRINGs (a prefix operator, a trailing `;`)
1874    // sit at the SEQ ends. `sampling_statement` (`expr ~ f ( args ) ;`)
1875    // must pair `(`/`)`, not `~`/`;`.
1876    for &(oi, ov) in &string_positions {
1877        let Some(close_text) = matching_close_bracket(ov) else {
1878            continue;
1879        };
1880        if let Some(&(ci, _)) = string_positions
1881            .iter()
1882            .rev()
1883            .find(|(_, v)| *v == close_text)
1884        {
1885            if oi < ci
1886                && members[oi + 1..ci]
1887                    .iter()
1888                    .any(|m| unwrap_to_string(m).is_none())
1889            {
1890                roles[oi] = Some(TokenRole::BracketOpen);
1891                roles[ci] = Some(TokenRole::BracketClose);
1892                bracket_open_idx = Some(oi);
1893                bracket_close_idx = Some(ci);
1894                break;
1895            }
1896        }
1897    }
1898
1899    // First/last STRING fallback: handles word-like pairs (begin/end) and
1900    // same-text immediate delimiters (regex `/.../`) that the canonical
1901    // search does not recognise.
1902    if bracket_open_idx.is_none() && string_positions.len() >= 2 {
1903        let (first_idx, first_val) = string_positions[0];
1904        let (last_idx, last_val) = string_positions[string_positions.len() - 1];
1905
1906        let has_content_between = members[first_idx + 1..last_idx]
1907            .iter()
1908            .any(|m| unwrap_to_string(m).is_none());
1909
1910        let both_punct = !is_word_like(first_val) && !is_word_like(last_val);
1911        let both_word = is_word_like(first_val) && is_word_like(last_val);
1912        // Same-text delimiters (e.g. regex `/.../`) are a bracket pair
1913        // when at least one side is IMMEDIATE_TOKEN — the grammar's
1914        // structural signal that the delimiter must be tight against
1915        // the content.
1916        let either_immediate =
1917            is_immediate_token(&members[first_idx]) || is_immediate_token(&members[last_idx]);
1918        let same_text_immediate = first_val == last_val && either_immediate;
1919        if has_content_between
1920            && (both_punct || both_word)
1921            && (first_val != last_val || same_text_immediate)
1922        {
1923            roles[first_idx] = Some(TokenRole::BracketOpen);
1924            roles[last_idx] = Some(TokenRole::BracketClose);
1925            bracket_open_idx = Some(first_idx);
1926            bracket_close_idx = Some(last_idx);
1927        }
1928    }
1929
1930    let first_content_idx = members.iter().position(|m| unwrap_to_string(m).is_none());
1931    let last_content_idx = members.iter().rposition(|m| unwrap_to_string(m).is_none());
1932
1933    for (i, m) in members.iter().enumerate() {
1934        if roles[i].is_some() {
1935            continue;
1936        }
1937        if let Some(value) = unwrap_to_string(m) {
1938            roles[i] = Some(if is_immediate_token(m) {
1939                // The grammar wraps this token in IMMEDIATE_TOKEN: the
1940                // lexer glues it to its neighbours (the `.` in a float
1941                // `0.5`). Derive tightness from that fact rather than
1942                // guessing from position.
1943                TokenRole::Immediate
1944            } else if is_word_like(value) {
1945                TokenRole::Keyword
1946            } else if !in_choice && first_content_idx.is_some_and(|fc| i < fc) {
1947                if is_prefix_sigil(value) {
1948                    TokenRole::BracketOpen
1949                } else {
1950                    TokenRole::Operator
1951                }
1952            } else if last_content_idx.is_some_and(|lc| i > lc) {
1953                TokenRole::BracketClose
1954            } else if !in_choice
1955                && string_positions.len() == 1
1956                && content_count == 2
1957                && value.len() == 1
1958            {
1959                TokenRole::Connector
1960            } else {
1961                TokenRole::Operator
1962            });
1963        }
1964    }
1965
1966    // Override: in a REPEAT body's inner SEQ, the first STRING is a
1967    // separator. This is checked by the caller (REPEAT handler), not here.
1968    // But we do store bracket indices for the caller to use.
1969    let _ = (bracket_open_idx, bracket_close_idx);
1970
1971    roles
1972}
1973
1974/// Extract line-comment prefixes from the grammar's extras rules.
1975///
1976/// A line comment is identified by: the rule name is in
1977/// `grammar.extras` AND the rule body structurally matches
1978/// `TOKEN(SEQ [STRING prefix, PATTERN ...])` where the PATTERN
1979/// matches to end-of-line.
1980pub(crate) fn extract_line_comment_prefixes(grammar: &Grammar) -> Vec<String> {
1981    let mut prefixes = Vec::new();
1982    for extra_name in &grammar.extras {
1983        if let Some(rule) = grammar.rules.get(extra_name) {
1984            if let Some(prefix) = extract_line_comment_prefix(rule) {
1985                prefixes.push(prefix);
1986            }
1987        }
1988    }
1989    prefixes
1990}
1991
1992/// Does this production, after resolving SYMBOL references and unwrapping
1993/// transparent wrappers, BEGIN with a newline-only token? Used to detect
1994/// the trailing element of a hard-line-break SEQ (`_soft_line_break`,
1995/// `_newline_token`, a bare `\n` STRING/PATTERN).
1996fn production_is_newline_leading(
1997    grammar: &Grammar,
1998    prod: &Production,
1999    seen: &mut std::collections::HashSet<String>,
2000) -> bool {
2001    match prod {
2002        Production::String { value } | Production::Pattern { value } => {
2003            is_newline_like_pattern(value)
2004        }
2005        Production::Seq { members } => members
2006            .first()
2007            .is_some_and(|m| production_is_newline_leading(grammar, m, seen)),
2008        Production::Symbol { name } => {
2009            if !seen.insert(name.clone()) {
2010                return false;
2011            }
2012            grammar
2013                .rules
2014                .get(name)
2015                .is_some_and(|r| production_is_newline_leading(grammar, r, seen))
2016        }
2017        Production::Token { content }
2018        | Production::ImmediateToken { content }
2019        | Production::Prec { content, .. }
2020        | Production::PrecLeft { content, .. }
2021        | Production::PrecRight { content, .. }
2022        | Production::PrecDynamic { content, .. }
2023        | Production::Field { content, .. }
2024        | Production::Alias { content, .. }
2025        | Production::Reserved { content, .. } => {
2026            production_is_newline_leading(grammar, content, seen)
2027        }
2028        _ => false,
2029    }
2030}
2031
2032/// Collect the leading "hard line break" markers (see
2033/// [`Grammar::trailing_break_markers`]).
2034///
2035/// A production `SEQ[first, .., last]` is a hard-line-break idiom when its
2036/// LAST member is newline-leading. The FIRST member then carries the break
2037/// marker: a CHOICE (or bare element) whose alternatives are either a
2038/// single-character non-alphanumeric STRING (`\`) — a standalone break
2039/// marker — or a whitespace-only PATTERN (`[ \t]+`). The former is
2040/// collected literally; the latter sets the whitespace-sensitivity flag.
2041pub(crate) fn classify_trailing_break_markers(grammar: &Grammar) -> (Vec<String>, bool) {
2042    fn collect_marker_alts(
2043        grammar: &Grammar,
2044        prod: &Production,
2045        lits: &mut Vec<String>,
2046        ws: &mut bool,
2047        seen: &mut std::collections::HashSet<String>,
2048    ) {
2049        match prod {
2050            Production::Choice { members } => {
2051                for m in members {
2052                    collect_marker_alts(grammar, m, lits, ws, seen);
2053                }
2054            }
2055            Production::Symbol { name } => {
2056                // Resolve hidden break-marker symbols (`_whitespace_ge_2`)
2057                // one level, cycle-guarded. A concrete named rule is line
2058                // content in its own right, never a bare break marker.
2059                if let Some(r) = grammar
2060                    .rules
2061                    .get(name)
2062                    .filter(|_| seen.insert(name.clone()))
2063                {
2064                    collect_marker_alts(grammar, r, lits, ws, seen);
2065                }
2066            }
2067            Production::Token { content }
2068            | Production::ImmediateToken { content }
2069            | Production::Prec { content, .. }
2070            | Production::PrecLeft { content, .. }
2071            | Production::PrecRight { content, .. }
2072            | Production::PrecDynamic { content, .. }
2073            | Production::Field { content, .. }
2074            | Production::Alias { content, .. }
2075            | Production::Reserved { content, .. } => {
2076                collect_marker_alts(grammar, content, lits, ws, seen);
2077            }
2078            Production::String { value } => {
2079                // A standalone single-character non-alphanumeric break marker
2080                // (the `\` of a hard line break). Multi-character / word-like
2081                // literals are substantive line content, not break markers.
2082                let mut chars = value.chars();
2083                if let (Some(c), None) = (chars.next(), chars.clone().next()) {
2084                    if !c.is_alphanumeric() && !c.is_whitespace() {
2085                        lits.push(value.clone());
2086                    }
2087                }
2088            }
2089            Production::Pattern { value } => {
2090                // A whitespace-only PATTERN, possibly a top-level alternation
2091                // of whitespace branches (`_whitespace_ge_2 = \t| [ \t]+`).
2092                let ws_only = super::split_top_level_alternation(value)
2093                    .iter()
2094                    .all(|b| is_whitespace_only_pattern(b.trim()));
2095                if ws_only {
2096                    *ws = true;
2097                }
2098            }
2099            _ => {}
2100        }
2101    }
2102
2103    let mut lits: Vec<String> = Vec::new();
2104    let mut ws = false;
2105    for rule in grammar.rules.values() {
2106        if let Production::Seq { members } = rule {
2107            if members.len() >= 2
2108                && production_is_newline_leading(
2109                    grammar,
2110                    members.last().expect("len >= 2"),
2111                    &mut std::collections::HashSet::new(),
2112                )
2113            {
2114                collect_marker_alts(
2115                    grammar,
2116                    &members[0],
2117                    &mut lits,
2118                    &mut ws,
2119                    &mut std::collections::HashSet::new(),
2120                );
2121            }
2122        }
2123    }
2124    lits.sort();
2125    lits.dedup();
2126    (lits, ws)
2127}
2128
2129/// Extract the start symbol (first rule key) from raw grammar.json bytes.
2130///
2131/// tree-sitter's start symbol is the first rule as written, but the
2132/// [`Grammar::rules`] `BTreeMap` reorders alphabetically. We recover the
2133/// original first key with a minimal scan: locate the top-level `"rules"`
2134/// object and read its first JSON string key. Returns an empty string on
2135/// any malformed input (the caller then falls back to no special-casing).
2136fn extract_start_symbol(bytes: &[u8]) -> String {
2137    let Ok(text) = std::str::from_utf8(bytes) else {
2138        return String::new();
2139    };
2140    let Some(rules_at) = text.find("\"rules\"") else {
2141        return String::new();
2142    };
2143    let after = &text[rules_at + "\"rules\"".len()..];
2144    // Skip to the object's opening brace.
2145    let Some(brace) = after.find('{') else {
2146        return String::new();
2147    };
2148    let mut chars = after[brace + 1..].char_indices();
2149    // Find the first string key.
2150    for (_, c) in chars.by_ref() {
2151        if c == '"' {
2152            break;
2153        }
2154        if !c.is_whitespace() {
2155            return String::new();
2156        }
2157    }
2158    let mut key = String::new();
2159    while let Some((_, c)) = chars.next() {
2160        match c {
2161            // Skip the escaped character after a backslash (rule names are
2162            // plain identifiers, but stay robust to escapes).
2163            '\\' => {
2164                chars.next();
2165            }
2166            '"' => return key,
2167            _ => key.push(c),
2168        }
2169    }
2170    String::new()
2171}
2172
2173/// Does this PATTERN value match a bare newline? True when some top-level
2174/// alternation branch is a negated character class (`[^...]`, optionally
2175/// quantified or anchored to a following atom) that does NOT exclude `\n`
2176/// or `\r` — the free-text-content idiom (`liquid`'s `[^{]+`).
2177fn pattern_admits_newline(value: &str) -> bool {
2178    for branch in super::split_top_level_alternation(value) {
2179        let b = branch.trim();
2180        if let Some(rest) = b.strip_prefix("[^") {
2181            if let Some(idx) = rest.find(']') {
2182                let inner = &rest[..idx];
2183                // A negated class admits newline unless it explicitly lists
2184                // a newline atom (`\n` / `\r`).
2185                if !inner.contains("\\n") && !inner.contains("\\r") {
2186                    return true;
2187                }
2188            }
2189        }
2190    }
2191    false
2192}
2193
2194/// Determine whether the grammar's top-level document repeat directly
2195/// admits a free-text content node matching a bare newline (see
2196/// [`Grammar::top_level_text_admits_newline`]).
2197pub(crate) fn classify_top_level_text_admits_newline(grammar: &Grammar) -> bool {
2198    // The start symbol is the FIRST rule as written in grammar.json.
2199    let Some(start_body) = grammar.rules.get(&grammar.start_symbol) else {
2200        return false;
2201    };
2202
2203    // Collect the concrete content kinds that are DIRECT members of the
2204    // start symbol's top-level repeat, descending through SEQ / REPEAT /
2205    // CHOICE / OPTIONAL and resolving hidden (`_`-prefixed) symbols. A
2206    // concrete (non-hidden) symbol terminates the descent: it is a
2207    // document node whose own body we then inspect for free text.
2208    fn collect_content_kinds(
2209        grammar: &Grammar,
2210        prod: &Production,
2211        out: &mut std::collections::HashSet<String>,
2212        seen: &mut std::collections::HashSet<String>,
2213    ) {
2214        match prod {
2215            Production::Seq { members } | Production::Choice { members } => {
2216                for m in members {
2217                    collect_content_kinds(grammar, m, out, seen);
2218                }
2219            }
2220            Production::Repeat { content }
2221            | Production::Repeat1 { content }
2222            | Production::Optional { content }
2223            | Production::Token { content }
2224            | Production::ImmediateToken { content }
2225            | Production::Prec { content, .. }
2226            | Production::PrecLeft { content, .. }
2227            | Production::PrecRight { content, .. }
2228            | Production::PrecDynamic { content, .. }
2229            | Production::Field { content, .. }
2230            | Production::Reserved { content, .. } => {
2231                collect_content_kinds(grammar, content, out, seen);
2232            }
2233            Production::Symbol { name } => {
2234                if name.starts_with('_') {
2235                    // Hidden rule: descend through it (inlined by tree-sitter).
2236                    if seen.insert(name.clone()) {
2237                        if let Some(r) = grammar.rules.get(name) {
2238                            collect_content_kinds(grammar, r, out, seen);
2239                        }
2240                    }
2241                } else {
2242                    // Concrete document node: record, do not descend.
2243                    out.insert(name.clone());
2244                }
2245            }
2246            _ => {}
2247        }
2248    }
2249
2250    let mut kinds = std::collections::HashSet::new();
2251    collect_content_kinds(
2252        grammar,
2253        start_body,
2254        &mut kinds,
2255        &mut std::collections::HashSet::new(),
2256    );
2257
2258    // A content kind admits a trailing newline when its body is (or is a
2259    // REPEAT1 of) a free-text PATTERN matching a bare newline.
2260    fn body_admits_newline_text(prod: &Production) -> bool {
2261        match prod {
2262            Production::Pattern { value } => pattern_admits_newline(value),
2263            Production::Repeat1 { content } | Production::Repeat { content } => {
2264                body_admits_newline_text(content)
2265            }
2266            Production::Choice { members } => members.iter().any(body_admits_newline_text),
2267            Production::Token { content } | Production::ImmediateToken { content } => {
2268                body_admits_newline_text(content)
2269            }
2270            _ => false,
2271        }
2272    }
2273
2274    kinds
2275        .iter()
2276        .any(|k| grammar.rules.get(k).is_some_and(body_admits_newline_text))
2277}
2278
2279// ═══════════════════════════════════════════════════════════════════
2280// Format policy
2281/// Classify external scanner tokens as layout actions based on
2282/// tree-sitter naming conventions. These conventions are ecosystem-
2283/// wide, not language-specific: every indent-based grammar uses
2284/// `_indent`/`_dedent`, every ASI grammar uses `_automatic_semicolon`.
2285pub(crate) fn classify_external_layout_tokens(grammar: &mut Grammar) {
2286    // External tokens have no grammar rule. We identify them by
2287    // checking which hidden symbols are NOT in grammar.rules.
2288    // Then classify by naming convention.
2289    //
2290    // This runs after external_alias_map is built, so tokens with
2291    // known text are already handled. Layout tokens are the remainder.
2292    let all_hidden_refs = collect_all_symbol_refs(&grammar.rules);
2293    for name in &all_hidden_refs {
2294        if !name.starts_with('_') || grammar.rules.contains_key(name) {
2295            continue;
2296        }
2297        if grammar.external_alias_map.contains_key(name) {
2298            continue;
2299        }
2300        if name == "_indent" || name.ends_with("_indent") {
2301            grammar.external_indent_opens.insert(name.clone());
2302        } else if name == "_dedent" || name.ends_with("_dedent") {
2303            grammar.external_indent_closes.insert(name.clone());
2304        } else if name.contains("line_ending")
2305            || name.contains("newline")
2306            || name.ends_with("_or_eof")
2307            // `_automatic_separator` is the tree-sitter ASI convention for a
2308            // scanner-inserted statement terminator that is a NEWLINE (V's
2309            // statement separator between `*ap = size` and the prior stmt).
2310            // Unclassified, it emitted nothing and merged adjacent statements
2311            // onto one line, so `& u64(a)` `*ap` re-lexed as a multiplication.
2312            || name.contains("automatic_separator")
2313            // The tree-sitter ASI / layout-rule families surface a statement
2314            // terminator that the scanner inserts where the source wrote a
2315            // NEWLINE (or end-of-construct), never a literal `;` -- the written
2316            // `;` is always a separate STRING token (`_semicolon =
2317            // CHOICE[_automatic_semicolon, ";"]` in JS/perl, or no `;` at all
2318            // in kotlin where `_semi = _automatic_semicolon`). Emitting these
2319            // as a literal `;` corrupts content: kotlin `enum {...}` (the
2320            // mandatory `source_file` `_semi` after the class) accreted a
2321            // spurious trailing `;`. They are newlines.
2322            || name.contains("automatic_semicolon")
2323            || name.contains("layout_semicolon")
2324        {
2325            grammar.external_newlines.insert(name.clone());
2326        } else if name.contains("semicolon") {
2327            grammar.external_semicolons.insert(name.clone());
2328        }
2329    }
2330}
2331
2332/// Identify external scanner tokens that bracket content, derived from
2333/// grammar structure: a rule whose (unwrapped) body is a SEQ whose first
2334/// and last members are external SYMBOLs (no grammar rule of their own)
2335/// with content in between is a delimiter pair around that content. The
2336/// canonical case is `string = SEQ[string_start, REPEAT(content),
2337/// string_end]`: the delimiters must hug the content (`'hello'`), and
2338/// the grammar states which externals they are without naming
2339/// conventions.
2340pub(crate) fn classify_external_bracket_delimiters(grammar: &mut Grammar) {
2341    let is_external = |name: &str| !grammar.rules.contains_key(name);
2342    let mut opens = std::collections::HashSet::new();
2343    let mut closes = std::collections::HashSet::new();
2344    let mut content_kinds = std::collections::HashSet::new();
2345    for rule in grammar.rules.values() {
2346        let Production::Seq { members } = unwrap_to_seq(rule) else {
2347            continue;
2348        };
2349        if members.len() < 3 {
2350            continue;
2351        }
2352        let (Some(first), Some(last)) = (members.first(), members.last()) else {
2353            continue;
2354        };
2355        // The delimiter may be a bare external SYMBOL or an anonymous ALIAS
2356        // renaming one (ruby `ALIAS{_string_start, value:"\""}`); unwrap to
2357        // the underlying external name in both cases.
2358        let (Some(open), Some(close)) = (
2359            delimiter_external_name(first),
2360            delimiter_external_name(last),
2361        ) else {
2362            continue;
2363        };
2364        if open == close || !is_external(open) || !is_external(close) {
2365            continue;
2366        }
2367        // Content between the delimiters (a REPEAT of string content, an
2368        // interpolation choice, …) — at least one non-delimiter member.
2369        opens.insert(open.to_owned());
2370        closes.insert(close.to_owned());
2371        // The visible (non-`_`) external symbols reachable from the members
2372        // *between* the delimiters are the captured-content tokens (ruby
2373        // `string_content`/`heredoc_content`, regex content). They carry the
2374        // literal source bytes and must emit tight; collect them, expanding
2375        // hidden-rule references (ruby wraps the content in the hidden
2376        // `_literal_contents = REPEAT1(CHOICE[string_content, …])`).
2377        for member in &members[1..members.len() - 1] {
2378            collect_visible_external_content(grammar, member, &mut content_kinds, &mut Vec::new());
2379        }
2380    }
2381    grammar.external_bracket_opens = opens;
2382    grammar.external_bracket_closes = closes;
2383    grammar.external_content_kinds = content_kinds;
2384}
2385
2386/// Derive the emitted text for an external *closing* delimiter whose
2387/// matching *opener* is a literal `STRING`. A rule shaped `SEQ[STRING q,
2388/// body.., EXTERNAL close]` opens with a grammar literal but closes with a
2389/// scanner external that has no rule and no otherwise-resolvable text
2390/// (TOML's `_multiline_basic_string` / `_multiline_literal_string`). Such a
2391/// string closes with the same delimiter it opens with, so the external
2392/// close emits the opener's literal. Matches only the asymmetric
2393/// STRING-open / external-close shape (the all-external and all-STRING
2394/// shapes are handled by `classify_external_bracket_delimiters` /
2395/// `classify_string_content_kinds`); the close must be an external with no
2396/// rule, so ordinary bracketed constructs do not match.
2397pub(crate) fn classify_external_close_text(grammar: &mut Grammar) {
2398    let is_external = |name: &str| !grammar.rules.contains_key(name);
2399    let mut close_text = std::collections::HashMap::new();
2400    for rule in grammar.rules.values() {
2401        let Production::Seq { members } = unwrap_to_seq(rule) else {
2402            continue;
2403        };
2404        if members.len() < 3 {
2405            continue;
2406        }
2407        let (Some(first), Some(last)) = (members.first(), members.last()) else {
2408            continue;
2409        };
2410        // Opener: a literal STRING (possibly wrapped in a token/precedence
2411        // node) that is a *quote delimiter* — its first character is a
2412        // string quote (`"`, `'`, or a backtick). A keyword opener (FIRRTL
2413        // `memory = SEQ["mem", .., _dedent]`) is NOT a string delimiter, so
2414        // its trailing external is a layout token, not a closing quote.
2415        let Some(open) = string_literal_value(first) else {
2416            continue;
2417        };
2418        if !open.starts_with(['"', '\'', '`']) {
2419            continue;
2420        }
2421        // Closer: a bare external SYMBOL with no rule that is NOT a layout
2422        // token (a `_dedent` / `_newline` / `_indent` closing an indent
2423        // block or terminating a line is structural layout, not a string
2424        // terminator — emitting the open quote in its place corrupts the
2425        // block structure).
2426        let Some(close) = delimiter_external_name(last) else {
2427            continue;
2428        };
2429        if !is_external(close)
2430            || grammar.external_indent_closes.contains(close)
2431            || grammar.external_indent_opens.contains(close)
2432            || grammar.external_newlines.contains(close)
2433            || close.contains("indent")
2434            || close.contains("dedent")
2435            || close.contains("newline")
2436            || close.contains("line_ending")
2437            || close.ends_with("_or_eof")
2438        {
2439            continue;
2440        }
2441        close_text.insert(close.to_owned(), open.to_owned());
2442    }
2443    grammar.external_close_text = close_text;
2444}
2445
2446/// The literal `STRING` value a delimiter member resolves to, unwrapping
2447/// token / immediate-token / precedence wrappers. `None` for any other
2448/// shape.
2449fn string_literal_value(prod: &Production) -> Option<&str> {
2450    match prod {
2451        Production::String { value } => Some(value.as_str()),
2452        Production::Token { content }
2453        | Production::ImmediateToken { content }
2454        | Production::Prec { content, .. }
2455        | Production::PrecLeft { content, .. }
2456        | Production::PrecRight { content, .. }
2457        | Production::PrecDynamic { content, .. }
2458        | Production::Reserved { content, .. } => string_literal_value(content),
2459        _ => None,
2460    }
2461}
2462
2463/// The external scanner-token name a delimiter member resolves to: either
2464/// a bare `SYMBOL` or an anonymous `ALIAS` renaming one (`ALIAS{
2465/// _string_start, value: "\""}`). `None` for any other shape.
2466fn delimiter_external_name(prod: &Production) -> Option<&str> {
2467    match prod {
2468        Production::Symbol { name } => Some(name.as_str()),
2469        Production::Alias {
2470            content,
2471            named: false,
2472            ..
2473        } => external_symbol_name(content),
2474        _ => None,
2475    }
2476}
2477
2478/// Collect the visible (non-`_`-prefixed) external scanner tokens reachable
2479/// from a string-body production, expanding hidden-rule references (the
2480/// content is often wrapped in a hidden `_literal_contents` rule). A visible
2481/// external has no rule and no leading underscore. Cycle-guarded on the
2482/// hidden rules visited.
2483fn collect_visible_external_content<'g>(
2484    grammar: &'g Grammar,
2485    prod: &'g Production,
2486    out: &mut std::collections::HashSet<String>,
2487    visiting: &mut Vec<&'g str>,
2488) {
2489    match prod {
2490        Production::Symbol { name } => {
2491            if !grammar.rules.contains_key(name) {
2492                if !name.starts_with('_') {
2493                    out.insert(name.clone());
2494                }
2495            } else if name.starts_with('_') && !visiting.contains(&name.as_str()) {
2496                // Expand a hidden rule reference to reach its content tokens.
2497                visiting.push(name.as_str());
2498                if let Some(rule) = grammar.rules.get(name) {
2499                    collect_visible_external_content(grammar, rule, out, visiting);
2500                }
2501                visiting.pop();
2502            }
2503        }
2504        Production::Choice { members } | Production::Seq { members } => {
2505            for m in members {
2506                collect_visible_external_content(grammar, m, out, visiting);
2507            }
2508        }
2509        Production::Repeat { content }
2510        | Production::Repeat1 { content }
2511        | Production::Optional { content }
2512        | Production::Token { content }
2513        | Production::ImmediateToken { content }
2514        | Production::Prec { content, .. }
2515        | Production::PrecLeft { content, .. }
2516        | Production::PrecRight { content, .. }
2517        | Production::PrecDynamic { content, .. }
2518        | Production::Reserved { content, .. }
2519        | Production::Field { content, .. }
2520        | Production::Alias { content, .. } => {
2521            collect_visible_external_content(grammar, content, out, visiting);
2522        }
2523        Production::String { .. } | Production::Pattern { .. } | Production::Blank => {}
2524    }
2525}
2526
2527/// Classify the *content kinds* of literal-quote-delimited string rules:
2528/// rules shaped `SEQ[STRING q, body.., STRING q]` where the same quote
2529/// literal opens and closes (CSS `string_value`, C# / Java
2530/// `string_literal`). The body is a `REPEAT`/`REPEAT1` over a `CHOICE` of
2531/// content pieces (`string_content` aliased over a PATTERN,
2532/// `escape_sequence`); each piece carries verbatim source bytes and must
2533/// emit *tight* on both sides so the layout pass does not accrete a space
2534/// between adjacent content / escape leaves (`"ab\t"`, not `"ab \t "`).
2535///
2536/// This is the literal-delimiter twin of
2537/// [`classify_external_bracket_delimiters`], which only matches *external*
2538/// (scanner) delimiters and so skips these STRING-quoted rules. Both feed
2539/// the same tight-content emission path. The match is anchored on a
2540/// *matched quote pair* and a `REPEAT` body, so it does not fire for
2541/// ordinary bracketed constructs (`( … )`, `{ … }`) whose opener and
2542/// closer differ, nor for indent blocks (no STRING delimiter).
2543/// The `SEQ` alternatives of a rule: a single `SEQ` yields itself; a
2544/// top-level `CHOICE` yields each alternative (unwrapped through
2545/// precedence / token wrappers). The CSS `string_value` is a
2546/// `CHOICE[SEQ['…'], SEQ["…"]]` (one alternative per quote style), so a
2547/// string classifier must look at each branch.
2548fn seq_alternatives(rule: &Production) -> Vec<&Production> {
2549    match unwrap_to_seq(rule) {
2550        Production::Choice { members } => members.iter().map(unwrap_to_seq).collect(),
2551        other => vec![other],
2552    }
2553}
2554
2555/// True when a string-body member is *unbounded*: it is directly a
2556/// `REPEAT`/`REPEAT1`, or it references a named symbol whose own rule is an
2557/// unbounded content rule (reached through `CHOICE`/`BLANK`/`OPTIONAL`/`FIELD`
2558/// wrappers). This lets a `string` whose body delegates the open-ended content
2559/// to a separate rule (query's `string_content = REPEAT1(...)`) still register
2560/// as a quoted-string shape. One hop only, to keep the classifier cheap and
2561/// avoid following arbitrary rule graphs.
2562fn member_is_unbounded_body(
2563    prod: &Production,
2564    rules: &std::collections::BTreeMap<String, Production>,
2565) -> bool {
2566    match prod {
2567        Production::Repeat { .. } | Production::Repeat1 { .. } => true,
2568        Production::Symbol { name } => rules
2569            .get(name)
2570            .is_some_and(|r| matches!(r, Production::Repeat { .. } | Production::Repeat1 { .. })),
2571        Production::Choice { members } | Production::Seq { members } => {
2572            members.iter().any(|m| member_is_unbounded_body(m, rules))
2573        }
2574        Production::Optional { content } | Production::Field { content, .. } => {
2575            member_is_unbounded_body(content, rules)
2576        }
2577        _ => false,
2578    }
2579}
2580
2581pub(crate) fn classify_string_content_kinds(grammar: &mut Grammar) {
2582    let mut accum = StringContentAccum::new();
2583    for rule in grammar.rules.values() {
2584        for seq in seq_alternatives(rule) {
2585            let Production::Seq { members } = seq else {
2586                continue;
2587            };
2588            if members.len() < 3 {
2589                continue;
2590            }
2591            // The opener is the first member and must be a literal quote
2592            // STRING (`'…'`, `"…"`).
2593            let Some(first @ Production::String { value: open }) = members.first() else {
2594                continue;
2595            };
2596            // Only a genuine *quote* delimiter (`'`, `"`, `` ` ``) opens a
2597            // string body. A bracket-like paired STRING (`|…|` block params,
2598            // `(…)`) is NOT a string: its inner symbols (`identifier`) are
2599            // structural children, not verbatim content, and must not be
2600            // emitted tight. (The old direct-`REPEAT` body guard happened to
2601            // exclude `|…|`; once the unbounded-body check follows a symbol
2602            // hop, the quote guard is what keeps the classifier sound.)
2603            if !is_quote_delimiter(first) {
2604                continue;
2605            }
2606            // The closer is the *last member whose (possibly wrapped) STRING
2607            // equals the opener*; a trailing suffix may follow (C#'s
2608            // `string_literal` ends with an optional `(u|U)8` encoding CHOICE
2609            // after the close quote). The close may be wrapped in an
2610            // `IMMEDIATE_TOKEN` (the scanner-tight close common to many string
2611            // rules, e.g. query `SEQ["\"", body, IMMEDIATE_TOKEN("\"")]`), so
2612            // unwrap before comparing.
2613            let Some(close_idx) = members
2614                .iter()
2615                .rposition(|m| unwrap_to_string(m) == Some(open.as_str()))
2616            else {
2617                continue;
2618            };
2619            if close_idx == 0 {
2620                continue;
2621            }
2622            // The body must be unbounded (the open-ended string body),
2623            // distinguishing a quoted string from a fixed two-quote token. A
2624            // body member is unbounded when it is itself a REPEAT/REPEAT1 or
2625            // when it references a named symbol whose own rule is an unbounded
2626            // content rule (query's `string` body is `CHOICE[string_content |
2627            // BLANK]` where `string_content = REPEAT1(...)`). Collect the named
2628            // content kinds it can yield, then commit only if the body confirmed
2629            // the string shape.
2630            let mut has_repeat_body = false;
2631            for member in &members[1..close_idx] {
2632                if member_is_unbounded_body(member, &grammar.rules) {
2633                    has_repeat_body = true;
2634                }
2635                collect_string_content_kinds(member, &mut accum);
2636            }
2637            if has_repeat_body {
2638                accum.commit();
2639            } else {
2640                accum.clear_in_rule_guard();
2641            }
2642        }
2643    }
2644    grammar.string_content_kinds = accum.into_set();
2645}
2646
2647/// Accumulator that only retains content kinds collected from a body that
2648/// actually had a `REPEAT` (an unbounded string body). A small helper so
2649/// `classify_string_content_kinds` can discard a false match without
2650/// losing kinds collected from other rules.
2651struct StringContentAccum {
2652    confirmed: std::collections::HashSet<String>,
2653    pending: std::collections::HashSet<String>,
2654}
2655
2656impl StringContentAccum {
2657    fn new() -> Self {
2658        Self {
2659            confirmed: std::collections::HashSet::new(),
2660            pending: std::collections::HashSet::new(),
2661        }
2662    }
2663    fn insert(&mut self, kind: String) {
2664        self.pending.insert(kind);
2665    }
2666    fn commit(&mut self) {
2667        for k in self.pending.drain() {
2668            self.confirmed.insert(k);
2669        }
2670    }
2671    fn clear_in_rule_guard(&mut self) {
2672        self.pending.clear();
2673    }
2674    fn into_set(mut self) -> std::collections::HashSet<String> {
2675        self.commit();
2676        self.confirmed
2677    }
2678}
2679
2680/// Recursively collect the named content kinds reachable inside a string
2681/// body: named `ALIAS` values (`string_content` over a PATTERN) and named
2682/// `SYMBOL` references (`escape_sequence`). Hidden (`_`-prefixed) symbols
2683/// are not vertices, so they are skipped; concrete sub-rules are not
2684/// recursed into (they are their own vertices with their own layout).
2685fn collect_string_content_kinds(prod: &Production, out: &mut StringContentAccum) {
2686    match prod {
2687        Production::Alias {
2688            value, named: true, ..
2689        } => out.insert(value.clone()),
2690        Production::Symbol { name } if !name.starts_with('_') => out.insert(name.clone()),
2691        Production::Choice { members } | Production::Seq { members } => {
2692            for m in members {
2693                collect_string_content_kinds(m, out);
2694            }
2695        }
2696        Production::Repeat { content }
2697        | Production::Repeat1 { content }
2698        | Production::Optional { content }
2699        | Production::Token { content }
2700        | Production::ImmediateToken { content }
2701        | Production::Prec { content, .. }
2702        | Production::PrecLeft { content, .. }
2703        | Production::PrecRight { content, .. }
2704        | Production::PrecDynamic { content, .. }
2705        | Production::Reserved { content, .. }
2706        | Production::Field { content, .. } => collect_string_content_kinds(content, out),
2707        _ => {}
2708    }
2709}
2710
2711/// Identify indented-block rules whose opening `_indent` is supplied by
2712/// a hidden parent: the rule's body references an external indent-close
2713/// token (`_dedent`) but no indent-open token. Python's `block = SEQ[
2714/// REPEAT(_statement), _dedent]` is the canonical case — the matching
2715/// `_indent` sits in the hidden `_suite` wrapper, which is not a vertex,
2716/// so the parser hands the emitter a bare `block` and the opening indent
2717/// must be synthesized.
2718pub(crate) fn classify_synthetic_indent_rules(grammar: &mut Grammar) {
2719    if grammar.external_indent_closes.is_empty() {
2720        return;
2721    }
2722    let mut rules = std::collections::HashSet::new();
2723    for (name, rule) in &grammar.rules {
2724        let symbols = referenced_symbols(rule);
2725        let references_close = symbols
2726            .iter()
2727            .any(|s| grammar.external_indent_closes.contains(*s));
2728        let references_open = symbols
2729            .iter()
2730            .any(|s| grammar.external_indent_opens.contains(*s));
2731        if references_close && !references_open {
2732            rules.insert(name.clone());
2733        }
2734    }
2735    grammar.synthetic_indent_rules = rules;
2736}
2737
2738/// Collect named terminal kinds whose underlying `PATTERN` can match a
2739/// leading space (see `pattern_absorbs_leading_space`). Two shapes
2740/// produce such a kind on the schema:
2741///
2742/// - `ALIAS { content: PATTERN p, named: true, value: K }` — the pattern is
2743///   renamed to kind `K` (INI's `setting_value`, `setting_name`, …).
2744/// - a named rule `K = PATTERN p` — the rule itself is a bare terminal.
2745///
2746/// In both cases the captured text round-trips through `K`, so a layout
2747/// space emitted before it would be absorbed and grow. The pattern is read
2748/// through token/precedence wrappers via [`terminal_pattern_of`].
2749pub(crate) fn classify_leading_space_terminals(
2750    grammar: &Grammar,
2751) -> std::collections::HashSet<String> {
2752    let mut out = std::collections::HashSet::new();
2753
2754    // Named rules that are themselves a bare terminal pattern.
2755    for (name, rule) in &grammar.rules {
2756        if let Some(p) = terminal_pattern_of(rule) {
2757            if pattern_absorbs_leading_space(p) {
2758                out.insert(name.clone());
2759            }
2760        }
2761    }
2762
2763    // Named aliases wrapping a terminal pattern.
2764    fn walk(prod: &Production, out: &mut std::collections::HashSet<String>) {
2765        match prod {
2766            Production::Alias {
2767                content,
2768                named: true,
2769                value,
2770            } => {
2771                if let Some(p) = terminal_pattern_of(content) {
2772                    if pattern_absorbs_leading_space(p) {
2773                        out.insert(value.clone());
2774                    }
2775                }
2776                walk(content, out);
2777            }
2778            Production::Alias { content, .. }
2779            | Production::Repeat { content }
2780            | Production::Repeat1 { content }
2781            | Production::Optional { content }
2782            | Production::Field { content, .. }
2783            | Production::Token { content }
2784            | Production::ImmediateToken { content }
2785            | Production::Prec { content, .. }
2786            | Production::PrecLeft { content, .. }
2787            | Production::PrecRight { content, .. }
2788            | Production::PrecDynamic { content, .. }
2789            | Production::Reserved { content, .. } => walk(content, out),
2790            Production::Seq { members } | Production::Choice { members } => {
2791                for m in members {
2792                    walk(m, out);
2793                }
2794            }
2795            _ => {}
2796        }
2797    }
2798    for rule in grammar.rules.values() {
2799        walk(rule, &mut out);
2800    }
2801    out
2802}
2803
2804/// Classify named alias values whose ALIAS content is an `IMMEDIATE_TOKEN`
2805/// wrapping a bare `PATTERN` content fragment AND which appears as the
2806/// quote-pair-delimited *body* of a string/character literal (C
2807/// `char_literal`'s `character` = `[^\n']`, `string_literal`'s
2808/// `string_content` = `[^\\"\n]+`). Such a token is lexed only with no
2809/// preceding whitespace; since the alias value has no grammar rule, the
2810/// rule-head `IMMEDIATE_TOKEN` no-space check never fires, so the emitter
2811/// records these kinds to emit them tight (`'hey'` stays tight rather than
2812/// re-spacing to `' h e y'`, whose spaces re-parse as extra `character`s).
2813///
2814/// Two structural conditions, BOTH required:
2815///
2816/// 1. The immediate-token content reduces to a bare `PATTERN` (a character-
2817///    class content fragment), NOT a `STRING` / `CHOICE`-of-strings. Julia
2818///    spells a context-sensitive keyword-as-identifier as
2819///    `ALIAS{IMMEDIATE_TOKEN CHOICE[STRING "module", …], value:"identifier"}`,
2820///    a word-like token that must keep its surrounding whitespace
2821///    (`function foo`, not `functionfoo`). `terminal_pattern_of` returns
2822///    `Some` only for the bare-PATTERN shape, so it draws exactly this line.
2823///
2824/// 2. The alias is a member (through repeat/optional/choice wrappers) of a
2825///    `SEQ` whose first and last members are *quote delimiters* (a `STRING`
2826///    or `CHOICE`-of-`STRING`s ending in `'`/`"`/`` ` ``): i.e. it is the
2827///    delimited body of a quote pair. This excludes content fragments that
2828///    are numeric brace-range / bracket bodies whose `IMMEDIATE_TOKEN` is a
2829///    scanner immediacy fact LOCAL to that one construct, not a property of
2830///    the kind everywhere: bash `brace_expression = SEQ["{", number, "..",
2831///    number, "}"]` aliases `\d+` to `number`, but the very same `number`
2832///    kind is also a freestanding command argument (`exit 1`, `echo 1`)
2833///    where it must keep its leading space. The `{`/`}` brackets are not
2834///    quote delimiters, so `number` is not collected, and the argument
2835///    `number` is left spaced. Without this guard, db9b280 tightened every
2836///    `number` and regressed bash (`exit 1` → `exit1`).
2837pub(crate) fn classify_immediate_token_alias_kinds(
2838    grammar: &Grammar,
2839) -> std::collections::HashSet<String> {
2840    let mut out = std::collections::HashSet::new();
2841    // Collect every immediate-token bare-PATTERN alias reachable inside a
2842    // production (used for the interior body of a detected quote pair).
2843    fn collect_aliases(prod: &Production, out: &mut std::collections::HashSet<String>) {
2844        match prod {
2845            Production::Alias {
2846                content,
2847                named: true,
2848                value,
2849            } => {
2850                if is_immediate_token(content) && terminal_pattern_of(content).is_some() {
2851                    out.insert(value.clone());
2852                }
2853                collect_aliases(content, out);
2854            }
2855            Production::Alias { content, .. }
2856            | Production::Repeat { content }
2857            | Production::Repeat1 { content }
2858            | Production::Optional { content }
2859            | Production::Field { content, .. }
2860            | Production::Token { content }
2861            | Production::ImmediateToken { content }
2862            | Production::Prec { content, .. }
2863            | Production::PrecLeft { content, .. }
2864            | Production::PrecRight { content, .. }
2865            | Production::PrecDynamic { content, .. }
2866            | Production::Reserved { content, .. } => collect_aliases(content, out),
2867            Production::Seq { members } | Production::Choice { members } => {
2868                for m in members {
2869                    collect_aliases(m, out);
2870                }
2871            }
2872            _ => {}
2873        }
2874    }
2875    // Walk every production; whenever a `SEQ` is bracketed by quote
2876    // delimiters (a string/char literal), harvest the immediate-token
2877    // aliases from its interior body.
2878    fn walk(prod: &Production, out: &mut std::collections::HashSet<String>) {
2879        match prod {
2880            Production::Seq { members } => {
2881                if members.len() >= 2
2882                    && is_quote_delimiter(&members[0])
2883                    && is_quote_delimiter(&members[members.len() - 1])
2884                {
2885                    for m in &members[1..members.len() - 1] {
2886                        collect_aliases(m, out);
2887                    }
2888                }
2889                for m in members {
2890                    walk(m, out);
2891                }
2892            }
2893            Production::Choice { members } => {
2894                for m in members {
2895                    walk(m, out);
2896                }
2897            }
2898            Production::Alias { content, .. }
2899            | Production::Repeat { content }
2900            | Production::Repeat1 { content }
2901            | Production::Optional { content }
2902            | Production::Field { content, .. }
2903            | Production::Token { content }
2904            | Production::ImmediateToken { content }
2905            | Production::Prec { content, .. }
2906            | Production::PrecLeft { content, .. }
2907            | Production::PrecRight { content, .. }
2908            | Production::PrecDynamic { content, .. }
2909            | Production::Reserved { content, .. } => walk(content, out),
2910            _ => {}
2911        }
2912    }
2913    for rule in grammar.rules.values() {
2914        walk(rule, &mut out);
2915    }
2916    out
2917}
2918
2919/// Classify named terminal kinds whose underlying `PATTERN` runs to the end
2920/// of the line (`hash_bang_line = #!.*`, `shebang = #!...`). These are
2921/// rest-of-line terminals: the layout pass emits a newline after them so the
2922/// next sibling does not merge into the token on re-parse. The generalised
2923/// counterpart of `line_comment_prefixes` (a line comment is a STRING prefix
2924/// then a rest-of-line PATTERN; here the whole token is the pattern).
2925pub(crate) fn classify_line_rest_kinds(grammar: &Grammar) -> std::collections::HashSet<String> {
2926    let mut out = std::collections::HashSet::new();
2927
2928    // Named rules that are themselves a bare rest-of-line pattern.
2929    for (name, rule) in &grammar.rules {
2930        if let Some(p) = terminal_pattern_of(rule) {
2931            if is_rest_of_line_pattern(p) {
2932                out.insert(name.clone());
2933            }
2934        }
2935    }
2936
2937    // Named aliases wrapping a rest-of-line pattern.
2938    fn walk(prod: &Production, out: &mut std::collections::HashSet<String>) {
2939        match prod {
2940            Production::Alias {
2941                content,
2942                named: true,
2943                value,
2944            } => {
2945                if let Some(p) = terminal_pattern_of(content) {
2946                    if is_rest_of_line_pattern(p) {
2947                        out.insert(value.clone());
2948                    }
2949                }
2950                walk(content, out);
2951            }
2952            Production::Alias { content, .. }
2953            | Production::Repeat { content }
2954            | Production::Repeat1 { content }
2955            | Production::Optional { content }
2956            | Production::Field { content, .. }
2957            | Production::Token { content }
2958            | Production::ImmediateToken { content }
2959            | Production::Prec { content, .. }
2960            | Production::PrecLeft { content, .. }
2961            | Production::PrecRight { content, .. }
2962            | Production::PrecDynamic { content, .. }
2963            | Production::Reserved { content, .. } => walk(content, out),
2964            Production::Seq { members } | Production::Choice { members } => {
2965                for m in members {
2966                    walk(m, out);
2967                }
2968            }
2969            _ => {}
2970        }
2971    }
2972    for rule in grammar.rules.values() {
2973        walk(rule, &mut out);
2974    }
2975    out
2976}
panproto_parse/emit_pretty/grammar.rs

panproto_parse/emit_pretty/
grammar.rs