panproto_parse/emit_pretty/grammar.rs
1#![allow(
2 clippy::module_name_repetitions,
3 clippy::too_many_lines,
4 clippy::too_many_arguments,
5 clippy::map_unwrap_or,
6 clippy::option_if_let_else,
7 clippy::elidable_lifetime_names,
8 clippy::items_after_statements,
9 clippy::needless_pass_by_value,
10 clippy::single_match_else,
11 clippy::manual_let_else,
12 clippy::match_same_arms,
13 clippy::missing_const_for_fn,
14 clippy::single_char_pattern,
15 clippy::naive_bytecount,
16 clippy::expect_used,
17 clippy::redundant_pub_crate,
18 clippy::used_underscore_binding,
19 clippy::redundant_field_names,
20 clippy::struct_field_names,
21 clippy::redundant_else,
22 clippy::similar_names
23)]
24
25//! `emit_pretty::grammar` (Phase A decomposition).
26
27use super::{
28 BTreeMap, Deserialize, ParseError, collect_all_symbol_refs, external_symbol_name,
29 extract_line_comment_prefix, has_repeat_recursive, is_immediate_token, is_newline_like_pattern,
30 is_prefix_sigil, is_quote_delimiter, is_rest_of_line_pattern, is_whitespace_only_pattern,
31 is_word_like, kind_satisfies_symbol, leading_optional_sign, literal_strings,
32 matching_close_bracket, pattern_absorbs_leading_space, referenced_symbols, terminal_pattern_of,
33 unwrap_to_seq, unwrap_to_string,
34};
35
36// ═══════════════════════════════════════════════════════════════════
37// Grammar JSON model
38// ═══════════════════════════════════════════════════════════════════
39
40/// A single tree-sitter production rule.
41///
42/// Mirrors the shape emitted by `tree-sitter generate`: every node has
43/// a `type` discriminator that selects a structural variant. The
44/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
45/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
46/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
47/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
48#[derive(Debug, Clone, Deserialize)]
49#[serde(tag = "type")]
50#[non_exhaustive]
51pub enum Production {
52 /// Concatenation of productions.
53 #[serde(rename = "SEQ")]
54 Seq {
55 /// Ordered members; each is emitted in turn.
56 members: Vec<Self>,
57 },
58 /// Alternation between productions.
59 #[serde(rename = "CHOICE")]
60 Choice {
61 /// Alternatives; the walker picks one based on the schema's
62 /// children and constraints.
63 members: Vec<Self>,
64 },
65 /// Zero-or-more repetition.
66 #[serde(rename = "REPEAT")]
67 Repeat {
68 /// The repeated body.
69 content: Box<Self>,
70 },
71 /// One-or-more repetition.
72 #[serde(rename = "REPEAT1")]
73 Repeat1 {
74 /// The repeated body.
75 content: Box<Self>,
76 },
77 /// Optional inclusion (zero or one).
78 ///
79 /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
80 /// BLANK }`, but recent generator versions also emit explicit
81 /// `OPTIONAL` nodes; both shapes are accepted.
82 #[serde(rename = "OPTIONAL")]
83 Optional {
84 /// The optional body.
85 content: Box<Self>,
86 },
87 /// Reference to another rule by name.
88 #[serde(rename = "SYMBOL")]
89 Symbol {
90 /// Name of the referenced rule (matches a vertex kind on the
91 /// schema side).
92 name: String,
93 },
94 /// Literal token bytes.
95 #[serde(rename = "STRING")]
96 String {
97 /// The literal token. Emitted verbatim.
98 value: String,
99 },
100 /// Regex-matched terminal.
101 ///
102 /// At parse time this matches arbitrary bytes; at emit time the
103 /// walker substitutes a `literal-value` constraint when present
104 /// and falls back to a placeholder otherwise.
105 #[serde(rename = "PATTERN")]
106 Pattern {
107 /// The original regex.
108 value: String,
109 },
110 /// The empty production. Emits nothing.
111 #[serde(rename = "BLANK")]
112 Blank,
113 /// Named field over a content production.
114 ///
115 /// The field `name` matches an edge kind on the schema side; the
116 /// walker resolves the corresponding child vertex and recurses
117 /// into `content` with that child as context.
118 #[serde(rename = "FIELD")]
119 Field {
120 /// Field name (matches edge kind).
121 name: String,
122 /// The contents of the field.
123 content: Box<Self>,
124 },
125 /// An aliased production.
126 ///
127 /// `value` records the parser-visible kind; the walker emits
128 /// `content` and ignores the alias rename.
129 #[serde(rename = "ALIAS")]
130 Alias {
131 /// The aliased content.
132 content: Box<Self>,
133 /// Whether the alias is a named node.
134 #[serde(default)]
135 named: bool,
136 /// The alias's surface name.
137 #[serde(default)]
138 value: String,
139 },
140 /// A token wrapper.
141 ///
142 /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
143 /// lexical token; the walker emits the inner content unchanged.
144 #[serde(rename = "TOKEN")]
145 Token {
146 /// The wrapped content.
147 content: Box<Self>,
148 },
149 /// An immediate-token wrapper (no preceding whitespace).
150 ///
151 /// Treated like [`Production::Token`] for emit purposes.
152 #[serde(rename = "IMMEDIATE_TOKEN")]
153 ImmediateToken {
154 /// The wrapped content.
155 content: Box<Self>,
156 },
157 /// Precedence wrapper.
158 #[serde(rename = "PREC")]
159 Prec {
160 /// Precedence value (numeric or string). Ignored at emit time.
161 #[allow(dead_code)]
162 value: serde_json::Value,
163 /// The wrapped content.
164 content: Box<Self>,
165 },
166 /// Left-associative precedence wrapper.
167 #[serde(rename = "PREC_LEFT")]
168 PrecLeft {
169 /// Precedence value. Ignored at emit time.
170 #[allow(dead_code)]
171 value: serde_json::Value,
172 /// The wrapped content.
173 content: Box<Self>,
174 },
175 /// Right-associative precedence wrapper.
176 #[serde(rename = "PREC_RIGHT")]
177 PrecRight {
178 /// Precedence value. Ignored at emit time.
179 #[allow(dead_code)]
180 value: serde_json::Value,
181 /// The wrapped content.
182 content: Box<Self>,
183 },
184 /// Dynamic precedence wrapper.
185 #[serde(rename = "PREC_DYNAMIC")]
186 PrecDynamic {
187 /// Precedence value. Ignored at emit time.
188 #[allow(dead_code)]
189 value: serde_json::Value,
190 /// The wrapped content.
191 content: Box<Self>,
192 },
193 /// Reserved-word wrapper (tree-sitter ≥ 0.25).
194 ///
195 /// Tree-sitter's `RESERVED` rule marks an inner production as a
196 /// reserved-word context: the parser excludes the listed identifiers
197 /// from being treated as the inner symbol. The `context_name`
198 /// metadata names the reserved-word set; the emitter does not need
199 /// it (we are walking schema → bytes, not enforcing reserved-word
200 /// constraints), so we emit the inner content unchanged, the same
201 /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
202 #[serde(rename = "RESERVED")]
203 Reserved {
204 /// The wrapped content.
205 content: Box<Self>,
206 /// Name of the reserved-word context. Ignored at emit time.
207 #[allow(dead_code)]
208 #[serde(default)]
209 context_name: String,
210 },
211}
212
213/// Structural role of a STRING token within a grammar rule.
214///
215/// Derived at Grammar construction time from the token's position in
216/// the production rule body. The role determines spacing behavior in
217/// the layout pass via a role-pair lookup table.
218#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
219pub enum TokenRole {
220 /// First STRING in a matched-pair SEQ (e.g. `(`, `[`, `{`, `<`,
221 /// `begin`, `${`, `⟨`). No space after.
222 BracketOpen,
223 /// Last STRING in a matched-pair SEQ (e.g. `)`, `]`, `}`, `>`,
224 /// `end`, `⟩`). No space before.
225 BracketClose,
226 /// First STRING in a REPEAT body's inner SEQ (e.g. `,` in
227 /// `REPEAT(SEQ [",", item])`). No space before, space after.
228 Separator,
229 /// Alphanumeric STRING that is a language keyword (e.g. `if`,
230 /// `while`, `and`, `model`). Space before and after.
231 Keyword,
232 /// Non-alphanumeric STRING between content members inside a CHOICE
233 /// alternative (e.g. `+`, `=`, `~`, `<-` in binary expression
234 /// alternatives). Space before and after.
235 Operator,
236 /// Non-alphanumeric STRING between content members in a standalone
237 /// SEQ (not inside a CHOICE). Examples: `.` in `attribute`,
238 /// `::` in `scoped_identifier`, `->` in `pointer_member`. These
239 /// are structural connectors, not algebraic operators. No space.
240 Connector,
241 /// Text from a leaf vertex's `literal-value` constraint.
242 Terminal,
243 /// A token the grammar wraps in `IMMEDIATE_TOKEN`: the lexer emits it
244 /// glued to its neighbour with no intervening whitespace (the `.` in
245 /// a float literal `0.5`, an immediate string delimiter). Tight on
246 /// both sides, unconditionally. Mirrors
247 /// [`panproto_gat::LayoutRole::Immediate`].
248 Immediate,
249}
250
251/// A grammar's production-rule table, deserialized from `grammar.json`.
252///
253/// Only the fields the emitter consumes are decoded; precedences,
254/// conflicts, externals, and other parser-only metadata are ignored.
255#[derive(Debug, Clone, Deserialize)]
256#[non_exhaustive]
257pub struct Grammar {
258 /// Grammar name (e.g. `"rust"`, `"typescript"`).
259 #[allow(dead_code)]
260 pub name: String,
261 /// The grammar's start symbol: the first rule as written in
262 /// grammar.json (tree-sitter's entry point). Recovered from the raw
263 /// bytes because [`rules`](Self::rules) is a `BTreeMap` that loses the
264 /// original insertion order.
265 #[serde(skip)]
266 pub start_symbol: String,
267 /// Cached least-fixpoint of the per-hidden-rule minimum mandated child
268 /// count (see `rule_min_required_children`). The table depends only on the
269 /// grammar, so it is resolved once on first use and reused: recomputing the
270 /// whole-grammar fixpoint on every emit-dispatch decision was an O(rules)
271 /// tax per decision, and on yaml (202 rules in one mutually-recursive SCC)
272 /// it dominated emit time. `OnceLock` gives interior-mutability caching
273 /// behind `&Grammar`; `#[serde(skip)]` leaves it empty on deserialize so it
274 /// fills lazily regardless of construction path.
275 #[serde(skip)]
276 pub(crate) min_children: std::sync::OnceLock<std::collections::HashMap<String, usize>>,
277 /// Map from rule name (a vertex kind on the schema side) to
278 /// production. Entries are kept in lexical order so iteration
279 /// is deterministic.
280 pub rules: BTreeMap<String, Production>,
281 /// Supertypes declared in the grammar's `supertypes` field. A
282 /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
283 /// references; tree-sitter parsers report a node's kind as one
284 /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
285 /// than the supertype name (`parameter`), so the emitter needs to
286 /// know that a child kind in a subtype set should match the
287 /// supertype name when a SYMBOL references it.
288 #[serde(default, deserialize_with = "deserialize_supertypes")]
289 pub supertypes: std::collections::HashSet<String>,
290 /// Tree-sitter `extras` rules: the named symbols (typically comments)
291 /// that tree-sitter skips at parse time but records as children of the
292 /// surrounding vertex. They appear nowhere in the production grammar,
293 /// so the rule walker cannot reconcile them against the cursor — the
294 /// emit pass therefore drains them as a side channel: at vertex entry
295 /// and between REPEAT iterations any leading extras-kind edges are
296 /// consumed and emitted directly. The set is populated at
297 /// `Grammar::from_bytes` by collecting every `SYMBOL { name }` and
298 /// named `ALIAS { value, named: true }` under the top-level `extras`
299 /// array. Pattern-only extras (e.g. `\s` whitespace) are not vertex
300 /// kinds and are excluded.
301 #[serde(default, deserialize_with = "deserialize_extras")]
302 pub extras: std::collections::HashSet<String>,
303 /// Tree-sitter `inline` rules: named rules the generator splices into
304 /// every referencing production rather than emitting as their own
305 /// node. An inlined rule's children (its FIELDs and bare SYMBOL
306 /// members) are promoted to be direct children of the referencing
307 /// vertex, so on the schema side there is no child vertex of the
308 /// inlined rule's kind. When the emit walk hits a `SYMBOL` member
309 /// naming an inlined rule it must therefore expand that rule's body
310 /// inline against the current cursor (the same treatment a hidden
311 /// `_`-prefixed rule gets), or the inlined members' edges are dropped
312 /// (brightscript `sub_impl`/`function_impl` drop `parameters`/`body`/
313 /// `end_statement`). Populated from grammar.json's top-level `inline`
314 /// array.
315 #[serde(
316 rename = "inline",
317 default,
318 deserialize_with = "deserialize_supertypes"
319 )]
320 pub inline_rules: std::collections::HashSet<String>,
321 /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
322 /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
323 /// reference on the schema side.
324 ///
325 /// Built once at [`Grammar::from_bytes`] time by walking each
326 /// hidden rule (`_`-prefixed), declared supertype, and named
327 /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
328 /// recording the closure. This replaces the prior heuristic
329 /// `kind_satisfies_symbol` that walked the rule body on every
330 /// query: lookups are now O(1) and the relation is exactly the
331 /// transitive closure of "is reachable via hidden / supertype /
332 /// alias dispatch", with no over-expansion through non-hidden
333 /// non-supertype rule references.
334 #[serde(skip)]
335 pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
336 /// Precomputed Yield sets: `yield_sets[rule_name]` is the set of
337 /// concrete vertex kinds that can appear as the **first named
338 /// child** when that rule's production is taken.
339 ///
340 /// Defined inductively:
341 /// - `Yield(SYMBOL S)` where S is hidden/supertype = `Yield(rules[S])`
342 /// - `Yield(SYMBOL S)` where S is concrete = `{S}`
343 /// - `Yield(SEQ [M1, ...])` = `Yield(M1)` (only first member)
344 /// - `Yield(CHOICE [M1, ..., Mn])` = `⋃ Yield(Mi)`
345 /// - `Yield(OPTIONAL { c })` = `Yield(c) ∪ {ε}`
346 /// - `Yield(BLANK)` = `{ε}`
347 /// - Wrappers (PREC*, TOKEN, FIELD, REPEAT, etc.) = `Yield(content)`
348 /// - `Yield(STRING)` = `Yield(PATTERN)` = `∅`
349 /// - `Yield(ALIAS { value: V, named: true })` = `{V}`
350 ///
351 /// Epsilon is represented as the empty string `""`.
352 #[serde(skip)]
353 pub yield_sets: std::collections::HashMap<String, std::collections::HashSet<String>>,
354 /// Child kinds allowed per parent kind, derived from node-types.json.
355 /// Maps parent kind to the set of ALL named child kinds that tree-sitter's
356 /// parser can produce for that parent (from both `children.types` and
357 /// `fields.*.types`). Used by `augment_subtypes_from_node_types` to
358 /// close the grammar/parser divergence gap.
359 #[serde(skip)]
360 pub node_type_children: std::collections::HashMap<String, std::collections::HashSet<String>>,
361 /// Per-field child kinds from node-types.json: maps parent kind →
362 /// field name → set of child kinds. Used by the augmentation to
363 /// restrict subtype edges to structurally matching positions.
364 #[serde(skip)]
365 pub node_type_field_children: std::collections::HashMap<
366 String,
367 std::collections::HashMap<String, std::collections::HashSet<String>>,
368 >,
369 /// Non-field child kinds from node-types.json: maps parent kind →
370 /// set of child kinds that appear in `children.types` (not in any field).
371 #[serde(skip)]
372 pub node_type_nonfield_children:
373 std::collections::HashMap<String, std::collections::HashSet<String>>,
374 /// Anonymous ALIAS values for external scanner tokens. Maps external
375 /// symbol name (e.g. `_ternary_qmark`) to the ALIAS value string
376 /// (e.g. `"?"`). Built by scanning grammar.json rule bodies for
377 /// `ALIAS { content: SYMBOL S, named: false, value: V }` where S
378 /// has no grammar rule.
379 #[serde(skip)]
380 pub external_alias_map: std::collections::HashMap<String, String>,
381 /// Per-rule token role classification. Maps rule name to a map of
382 /// STRING value to its structural role in that rule. Derived at
383 /// construction time by analyzing each rule's SEQ structure to
384 /// identify bracket pairs, separators, keywords, and operators.
385 #[serde(skip)]
386 pub token_roles:
387 std::collections::HashMap<String, std::collections::HashMap<String, TokenRole>>,
388 /// Set of `(rule_name, open_bracket_value)` pairs where the bracket
389 /// triggers indentation (the content between open and close contains
390 /// `REPEAT`/`REPEAT1`). Block-level constructs like `statement_block`
391 /// use indenting brackets; inline constructs like interpolation do not.
392 #[serde(skip)]
393 pub indent_triggers: std::collections::HashSet<(String, String)>,
394 /// Line-comment prefixes extracted from the grammar's extras.
395 /// Each prefix is a STRING value from a `TOKEN(SEQ [STRING prefix,
396 /// PATTERN ...])` pattern in the extras array, verified to be an
397 /// extras rule. Used by the layout pass to insert a newline after
398 /// comment Lit tokens.
399 #[serde(skip)]
400 pub line_comment_prefixes: Vec<String>,
401 /// Bare literal markers that, when emitted as the final token of the
402 /// output, must NOT be followed by the customary end-of-output newline.
403 ///
404 /// Derived from productions of the shape `SEQ[CHOICE[.. bare lit ..],
405 /// <newline-leading>]` — tree-sitter's "hard line break" idiom
406 /// (`markdown_inline`'s `hard_line_break = SEQ[CHOICE["\\" |
407 /// _whitespace_ge_2], _soft_line_break]`). A trailing backslash (or
408 /// trailing whitespace, see [`trailing_break_on_whitespace`]) is plain
409 /// content on its own; only a following newline turns it into a
410 /// line-break node. The end-of-output newline the layout fold appends
411 /// would therefore manufacture a phantom break node on re-parse, so it
412 /// is suppressed when the output ends with one of these markers.
413 ///
414 /// Restricted to SINGLE-character non-alphanumeric literals so the rule
415 /// fires only on genuine standalone break markers (`\`), never on
416 /// keyword/identifier-led line constructs (`posting`, `declaration`,
417 /// `go_directive`) whose leading literal is substantive content.
418 ///
419 /// [`trailing_break_on_whitespace`]: Self::trailing_break_on_whitespace
420 #[serde(skip)]
421 pub trailing_break_markers: Vec<String>,
422 /// Whether the grammar has a hard-line-break production whose leading
423 /// alternative is a whitespace-only PATTERN (`markdown_inline`'s
424 /// `_whitespace_ge_2`). When set, a final emitted token ending in
425 /// trailing spaces/tabs also suppresses the end-of-output newline.
426 #[serde(skip)]
427 pub trailing_break_on_whitespace: bool,
428 /// Whether the grammar's top-level document repeat directly admits a
429 /// free-text content node whose pattern matches a bare newline
430 /// (template / markup grammars: `liquid`'s `template_content =
431 /// REPEAT1([^{]+ | ...)`, `twig`'s `content`, `eex`'s `text`). For such
432 /// grammars a lone trailing newline appended at end of output is
433 /// captured as an extra content node on re-parse, inflating the
434 /// kind-multiset, so the end-of-output newline is suppressed.
435 ///
436 /// Derived narrowly: the content rule must be a DIRECT child of the
437 /// start symbol's top-level REPEAT (through hidden symbols / CHOICE),
438 /// so the rule fires only on genuine document text, never on the
439 /// newline-admitting negated classes inside comments or string
440 /// fragments (which are nested under delimiters, not document nodes).
441 #[serde(skip)]
442 pub top_level_text_admits_newline: bool,
443 /// External tokens that produce indent-open layout actions.
444 /// Identified by tree-sitter naming convention: names ending with
445 /// `_indent` or equal to `_indent`.
446 #[serde(skip)]
447 pub external_indent_opens: std::collections::HashSet<String>,
448 /// External tokens that produce indent-close layout actions.
449 #[serde(skip)]
450 pub external_indent_closes: std::collections::HashSet<String>,
451 /// External tokens that produce line breaks.
452 #[serde(skip)]
453 pub external_newlines: std::collections::HashSet<String>,
454 /// External tokens equivalent to semicolons.
455 #[serde(skip)]
456 pub external_semicolons: std::collections::HashSet<String>,
457 /// External scanner tokens that open a delimiter pair around content
458 /// (e.g. `string_start` in `SEQ[string_start, REPEAT(content),
459 /// string_end]`). Derived structurally; emitted tight on the inside
460 /// (`'hello'`, not `' hello '`).
461 #[serde(skip)]
462 pub external_bracket_opens: std::collections::HashSet<String>,
463 /// External scanner tokens that close a delimiter pair around content
464 /// (e.g. `string_end`). Emitted tight on the inside.
465 #[serde(skip)]
466 pub external_bracket_closes: std::collections::HashSet<String>,
467 /// Visible (non-`_`-prefixed) external scanner tokens that are the
468 /// captured *content* between a pair of string/heredoc delimiters in a
469 /// `SEQ[open_ext, REPEAT(content..), close_ext]` rule (ruby
470 /// `string_content` / `heredoc_content`, regex content, command-string
471 /// content). Such a token's text IS the literal source bytes between the
472 /// delimiters: the layout pass must NOT insert a sibling-separation
473 /// space around it (`"bar"`, not `" bar "`), or a space folds into the
474 /// captured text on re-parse and accretes one space per emit. Derived
475 /// structurally from the same delimiter shape as
476 /// [`external_bracket_opens`](Self::external_bracket_opens).
477 #[serde(skip)]
478 pub external_content_kinds: std::collections::HashSet<String>,
479 /// Named content kinds that sit *between a matched pair of quote
480 /// delimiters* spelled as literal `STRING` tokens, in a rule shaped
481 /// `SEQ[STRING q, REPEAT(CHOICE[content..]), STRING q]` (the same
482 /// quote opens and closes). The CSS `string_value` and the C# / Java
483 /// `string_literal` are the canonical cases: the body is a `REPEAT`
484 /// over `CHOICE[string_content (an ALIAS over a PATTERN),
485 /// escape_sequence]`. Each such content / escape leaf carries the
486 /// verbatim source bytes and must emit *tight* on both sides
487 /// (`"ab\t"`, not `"ab \t "`), exactly like
488 /// [`external_content_kinds`](Self::external_content_kinds) but for the
489 /// STRING-delimited (rather than external-delimited) string shape that
490 /// `classify_external_bracket_delimiters` skips (it only matches
491 /// *external* delimiters). Derived purely from grammar structure (the
492 /// matched-literal-quote envelope), so it stays in the generic emitter.
493 #[serde(skip)]
494 pub string_content_kinds: std::collections::HashSet<String>,
495 /// Rule names that are indented blocks whose opening `_indent` lives
496 /// in a (hidden) parent rule rather than the rule itself: their body
497 /// references an external indent-*close* token (`_dedent`) but no
498 /// indent-*open* token. The parser reaches such a block vertex
499 /// directly (the hidden `_suite` wrapper carrying the `_indent` is
500 /// not a vertex), so the emitter must synthesize the opening indent
501 /// (`def f():` then an indented body) when it walks the rule.
502 #[serde(skip)]
503 pub synthetic_indent_rules: std::collections::HashSet<String>,
504 /// Named alias map: maps alias value to source symbol name.
505 /// When a vertex kind has no direct grammar rule, this map resolves
506 /// `ALIAS { content: SYMBOL source, named: true, value: alias }` so
507 /// the emitter can walk the source rule with proper token roles.
508 #[serde(skip)]
509 pub named_alias_map: std::collections::HashMap<String, String>,
510 /// Every source rule that aliases to a given kind, in grammar order.
511 /// A kind can be the `value` of several distinct `ALIAS` sites (cpp
512 /// `function_definition` is the alias value of `inline_method_definition`,
513 /// `constructor_or_destructor_definition`, `operator_cast_definition`, …,
514 /// AND has its own `function_definition` rule). When the vertex's own
515 /// rule cannot consume one of its child edges (a parser.c/grammar.json
516 /// desync where the collapsed-kind rule omits constructor-only members
517 /// like `field_initializer_list`), the emitter falls back to the alias
518 /// source whose production *does* admit the child set.
519 #[serde(skip)]
520 pub named_alias_sources: std::collections::HashMap<String, Vec<String>>,
521 /// Named terminal kinds whose underlying `PATTERN` can match a leading
522 /// space (e.g. INI's `setting_value = PATTERN ".+"`). A layout space
523 /// emitted *before* such a terminal would fold into its captured text
524 /// on re-parse and accrete one space per emit, so the emitter hugs them
525 /// to their predecessor. See `pattern_absorbs_leading_space`.
526 #[serde(skip)]
527 pub leading_space_terminals: std::collections::HashSet<String>,
528 /// Named terminal kinds whose underlying `PATTERN` runs to the end of
529 /// the source line (an unbounded trailing `.*` / `.+`, e.g. JS's
530 /// `hash_bang_line = #!.*`). Like a line comment, such a token absorbs
531 /// any text that follows it on the same line, so the layout pass emits a
532 /// newline after it: otherwise the next sibling re-parses as part of the
533 /// token. See `is_rest_of_line_pattern`.
534 #[serde(skip)]
535 pub line_rest_kinds: std::collections::HashSet<String>,
536 /// Named alias values whose ALIAS content reduces to an `IMMEDIATE_TOKEN`
537 /// (e.g. C's `char_literal` body `ALIAS{IMMEDIATE_TOKEN PATTERN "[^\n']",
538 /// value: "character"}`). The lexer admits such a token only with no
539 /// preceding whitespace, so the emitter hugs it to its predecessor: the
540 /// alias-value carries no grammar rule, so the rule-head `IMMEDIATE_TOKEN`
541 /// no-space check in `emit_vertex` never fires for it. Emitting these
542 /// leaves tight keeps `'hey'` from re-spacing to `' h e y'` (whose spaces
543 /// re-parse as extra `character` nodes).
544 #[serde(skip)]
545 pub immediate_token_alias_kinds: std::collections::HashSet<String>,
546 /// Text to emit for an external *closing* delimiter whose matching
547 /// *opener* is a literal `STRING` (a rule shaped `SEQ[STRING q, body..,
548 /// EXTERNAL close]`, the asymmetric twin of the all-external and
549 /// all-STRING delimiter shapes). TOML's `_multiline_basic_string =
550 /// SEQ[STRING """, REPEAT(..), _multiline_basic_string_end]` is the
551 /// canonical case: the open `"""` is a grammar literal, but the close is
552 /// a scanner external with no rule and no resolvable text, so the
553 /// emitter would drop it and leave the string unterminated. A multiline
554 /// string closes with the same delimiter it opens with, so the external
555 /// close emits the opener's literal. Derived purely from grammar
556 /// structure (the STRING-open / external-close envelope); stays in the
557 /// generic emitter.
558 #[serde(skip)]
559 pub external_close_text: std::collections::HashMap<String, String>,
560}
561
562pub(crate) fn deserialize_supertypes<'de, D>(
563 deserializer: D,
564) -> Result<std::collections::HashSet<String>, D::Error>
565where
566 D: serde::Deserializer<'de>,
567{
568 let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
569 let mut out = std::collections::HashSet::new();
570 for entry in entries {
571 match entry {
572 serde_json::Value::String(s) => {
573 out.insert(s);
574 }
575 serde_json::Value::Object(map) => {
576 if let Some(serde_json::Value::String(name)) = map.get("name") {
577 out.insert(name.clone());
578 }
579 }
580 _ => {}
581 }
582 }
583 Ok(out)
584}
585
586pub(crate) fn deserialize_extras<'de, D>(
587 deserializer: D,
588) -> Result<std::collections::HashSet<String>, D::Error>
589where
590 D: serde::Deserializer<'de>,
591{
592 let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
593 let mut out = std::collections::HashSet::new();
594 for entry in entries {
595 if let serde_json::Value::Object(map) = entry {
596 let ty = map.get("type").and_then(serde_json::Value::as_str);
597 match ty {
598 // SYMBOL { name: K } — the extras rule is a named symbol
599 // (typically `line_comment` / `block_comment`). The kind
600 // K appears as a real child vertex on the schema side.
601 Some("SYMBOL") => {
602 if let Some(serde_json::Value::String(name)) = map.get("name") {
603 out.insert(name.clone());
604 }
605 }
606 // ALIAS { content, value: V, named: true } — the extras
607 // rule renames its content; V is the kind on the schema.
608 Some("ALIAS") => {
609 let named = map
610 .get("named")
611 .and_then(serde_json::Value::as_bool)
612 .unwrap_or(false);
613 if named {
614 if let Some(serde_json::Value::String(value)) = map.get("value") {
615 out.insert(value.clone());
616 }
617 }
618 }
619 // PATTERN / STRING / TOKEN entries describe inter-token
620 // whitespace and have no vertex-side representation.
621 _ => {}
622 }
623 }
624 }
625 Ok(out)
626}
627
628impl Grammar {
629 /// Parse a grammar's `grammar.json` bytes.
630 ///
631 /// Builds the subtyping closure as part of construction so every
632 /// downstream lookup is O(1). The closure is the least relation
633 /// containing `(K, K)` for every rule key `K` and closed under:
634 ///
635 /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
636 /// reach SYMBOL `K`, then `K ⊑ S`.
637 /// - supertype expansion: if `S` is in the grammar's supertypes
638 /// block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
639 /// - alias renaming: if a rule body contains
640 /// `ALIAS { content: SYMBOL R, value: A, named: true }` where
641 /// `R` reaches kind `K` (or `K = R` when no further hop), then
642 /// `A ⊑ R` and `K ⊑ A`.
643 ///
644 /// # Errors
645 ///
646 /// Returns [`ParseError::EmitFailed`] when the bytes are not a
647 /// valid `grammar.json` document.
648 pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
649 Self::from_bytes_with_node_types(protocol, bytes, None)
650 }
651
652 /// Parse a grammar from both `grammar.json` and optionally
653 /// `node-types.json` bytes.
654 ///
655 /// # Errors
656 ///
657 /// Returns [`ParseError::EmitFailed`] when `grammar_bytes` is
658 /// not a valid `grammar.json` document.
659 pub fn from_bytes_with_node_types(
660 protocol: &str,
661 grammar_bytes: &[u8],
662 node_types_bytes: Option<&[u8]>,
663 ) -> Result<Self, ParseError> {
664 let mut grammar: Self =
665 serde_json::from_slice(grammar_bytes).map_err(|e| ParseError::EmitFailed {
666 protocol: protocol.to_owned(),
667 reason: format!("grammar.json deserialization failed: {e}"),
668 })?;
669 // The `rules` BTreeMap loses grammar.json's insertion order, but
670 // tree-sitter's START SYMBOL is the FIRST rule as written. Recover
671 // it from the raw bytes so precomputes keyed on the start symbol
672 // (top-level document text) use the right entry point.
673 grammar.start_symbol = extract_start_symbol(grammar_bytes);
674 grammar.subtypes = compute_subtype_closure(&grammar);
675 grammar.named_alias_map = build_named_alias_map(&grammar);
676 grammar.named_alias_sources = build_named_alias_sources(&grammar);
677 grammar.yield_sets = compute_yield_sets(&grammar);
678 if let Some(nt_bytes) = node_types_bytes {
679 let (all_children, field_children, nonfield_children) =
680 build_node_type_children(nt_bytes);
681 grammar.node_type_children = all_children;
682 grammar.node_type_field_children = field_children;
683 grammar.node_type_nonfield_children = nonfield_children;
684 // Repair grammar.json/parser.c FIELD-name drift before any
685 // field-driven precompute or augmentation reads the rule bodies,
686 // so the corrected names flow everywhere downstream.
687 reconcile_field_names(&mut grammar);
688 augment_subtypes_from_node_types(&mut grammar);
689 }
690 grammar.yield_sets = compute_yield_sets(&grammar);
691 grammar.external_alias_map = build_external_alias_map(&grammar);
692 let (token_roles, indent_triggers) = compute_token_roles(&grammar);
693 grammar.token_roles = token_roles;
694 grammar.indent_triggers = indent_triggers;
695 grammar.line_comment_prefixes = extract_line_comment_prefixes(&grammar);
696 let (tb_markers, tb_ws) = classify_trailing_break_markers(&grammar);
697 grammar.trailing_break_markers = tb_markers;
698 grammar.trailing_break_on_whitespace = tb_ws;
699 grammar.top_level_text_admits_newline = classify_top_level_text_admits_newline(&grammar);
700 classify_external_layout_tokens(&mut grammar);
701 classify_external_bracket_delimiters(&mut grammar);
702 classify_external_close_text(&mut grammar);
703 classify_string_content_kinds(&mut grammar);
704 classify_synthetic_indent_rules(&mut grammar);
705 grammar.leading_space_terminals = classify_leading_space_terminals(&grammar);
706 grammar.line_rest_kinds = classify_line_rest_kinds(&grammar);
707 grammar.immediate_token_alias_kinds = classify_immediate_token_alias_kinds(&grammar);
708 grammar.yield_sets = compute_yield_sets(&grammar);
709 Ok(grammar)
710 }
711}
712
713/// Compute the subtyping relation as a forward-indexed map from a
714/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
715pub(crate) fn compute_subtype_closure(
716 grammar: &Grammar,
717) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
718 use std::collections::{HashMap, HashSet};
719 // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
720 // recorded whenever Y is reached by walking the grammar's
721 // ALIAS / hidden-rule / supertype dispatch from a position where
722 // K is the actual vertex kind.
723 let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
724 for name in grammar.rules.keys() {
725 subtypes
726 .entry(name.clone())
727 .or_default()
728 .insert(name.clone());
729 }
730
731 // First pass: collect the immediate "satisfies" edges from each
732 // expandable rule (hidden, supertype) to the kinds reachable by
733 // walking its body, plus alias edges.
734 fn walk<'g>(
735 grammar: &'g Grammar,
736 production: &'g Production,
737 visited: &mut HashSet<&'g str>,
738 out: &mut HashSet<String>,
739 ) {
740 match production {
741 Production::Symbol { name } => {
742 // Direct subtype.
743 out.insert(name.clone());
744 // Continue expansion through hidden / supertype rules
745 // so the closure traverses pass-through dispatch.
746 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
747 if expand && visited.insert(name.as_str()) {
748 if let Some(rule) = grammar.rules.get(name) {
749 walk(grammar, rule, visited, out);
750 }
751 }
752 }
753 Production::Choice { members } => {
754 for m in members {
755 walk(grammar, m, visited, out);
756 }
757 }
758 Production::Alias {
759 content,
760 named,
761 value,
762 } => {
763 if *named && !value.is_empty() {
764 out.insert(value.clone());
765 }
766 walk(grammar, content, visited, out);
767 }
768 Production::Repeat { content }
769 | Production::Repeat1 { content }
770 | Production::Optional { content }
771 | Production::Field { content, .. }
772 | Production::Token { content }
773 | Production::ImmediateToken { content }
774 | Production::Prec { content, .. }
775 | Production::PrecLeft { content, .. }
776 | Production::PrecRight { content, .. }
777 | Production::PrecDynamic { content, .. }
778 | Production::Reserved { content, .. } => {
779 walk(grammar, content, visited, out);
780 }
781 _ => {}
782 }
783 }
784
785 for (name, rule) in &grammar.rules {
786 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
787 if !expand {
788 continue;
789 }
790 let mut visited: HashSet<&str> = HashSet::new();
791 visited.insert(name.as_str());
792 let mut reachable: HashSet<String> = HashSet::new();
793 walk(grammar, rule, &mut visited, &mut reachable);
794 for kind in &reachable {
795 subtypes
796 .entry(kind.clone())
797 .or_default()
798 .insert(name.clone());
799 }
800 }
801
802 // Aliases: scan every rule body for ALIAS { content, value }
803 // declarations. The kinds reachable from `content` satisfy
804 // `value`, AND (by construction) `value` satisfies the
805 // surrounding rule. Walking the ENTIRE grammar once captures
806 // every alias site, irrespective of which rule introduces it.
807 fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
808 match production {
809 Production::Alias {
810 content,
811 named,
812 value,
813 } => {
814 if *named && !value.is_empty() {
815 out.push((value.clone(), content.as_ref()));
816 }
817 collect_aliases(content, out);
818 }
819 Production::Choice { members } | Production::Seq { members } => {
820 for m in members {
821 collect_aliases(m, out);
822 }
823 }
824 Production::Repeat { content }
825 | Production::Repeat1 { content }
826 | Production::Optional { content }
827 | Production::Field { content, .. }
828 | Production::Token { content }
829 | Production::ImmediateToken { content }
830 | Production::Prec { content, .. }
831 | Production::PrecLeft { content, .. }
832 | Production::PrecRight { content, .. }
833 | Production::PrecDynamic { content, .. }
834 | Production::Reserved { content, .. } => {
835 collect_aliases(content, out);
836 }
837 _ => {}
838 }
839 }
840 let mut aliases: Vec<(String, &Production)> = Vec::new();
841 for rule in grammar.rules.values() {
842 collect_aliases(rule, &mut aliases);
843 }
844 for (alias_value, content) in aliases {
845 let mut visited: HashSet<&str> = HashSet::new();
846 let mut reachable: HashSet<String> = HashSet::new();
847 walk(grammar, content, &mut visited, &mut reachable);
848 // Aliased value satisfies itself and is satisfied by every
849 // kind its content can reach.
850 subtypes
851 .entry(alias_value.clone())
852 .or_default()
853 .insert(alias_value.clone());
854 for kind in reachable {
855 subtypes
856 .entry(kind)
857 .or_default()
858 .insert(alias_value.clone());
859 }
860 }
861
862 // Transitive close through hidden and supertype rules via Tarjan SCC.
863 //
864 // The relation `K ⊑ Y` means "a vertex of kind K can appear where
865 // the grammar says SYMBOL Y." Transitivity applies when Y is a
866 // hidden or supertype rule (a dispatch point), NOT when Y is a
867 // concrete named rule. We build the directed graph G on dispatchable
868 // node names with edge Y → Z iff Z ∈ subtypes[Y] and Z is dispatchable.
869 // The transitive closure within G is the union of every reachable
870 // dispatchable node, which by Tarjan's theorem is computed in
871 // O(V + E) by contracting SCCs into a DAG, then unioning closures
872 // along reverse topological order.
873 let is_dispatch = |s: &str| s.starts_with('_') || grammar.supertypes.contains(s);
874 // 1. Nodes: every dispatchable name that appears as a key in subtypes
875 // OR as a member of any subtypes value.
876 let mut nodes: HashSet<String> = HashSet::new();
877 for (k, vs) in &subtypes {
878 if is_dispatch(k) {
879 nodes.insert(k.clone());
880 }
881 for v in vs {
882 if is_dispatch(v) {
883 nodes.insert(v.clone());
884 }
885 }
886 }
887 let nodes: Vec<String> = nodes.into_iter().collect();
888 let index_of: HashMap<&str, usize> = nodes
889 .iter()
890 .enumerate()
891 .map(|(i, n)| (n.as_str(), i))
892 .collect();
893 // 2. Edges: Y → Z iff Z ∈ subtypes[Y] and both are dispatchable.
894 let mut edges: Vec<Vec<usize>> = vec![Vec::new(); nodes.len()];
895 for (i, name) in nodes.iter().enumerate() {
896 if let Some(targets) = subtypes.get(name) {
897 for t in targets {
898 if let Some(&j) = index_of.get(t.as_str()) {
899 if i != j {
900 edges[i].push(j);
901 }
902 }
903 }
904 }
905 }
906
907 // 3. Tarjan SCC. `comp[v]` = SCC index of `v`. SCC indices come out
908 // in reverse topological order (sinks first), which is exactly
909 // the order we want for closure accumulation.
910 fn tarjan(edges: &[Vec<usize>]) -> Vec<usize> {
911 let n = edges.len();
912 let mut comp = vec![usize::MAX; n];
913 let mut index_arr = vec![usize::MAX; n];
914 let mut lowlink = vec![0usize; n];
915 let mut on_stack = vec![false; n];
916 let mut stack: Vec<usize> = Vec::new();
917 let mut next_index = 0usize;
918 let mut next_comp = 0usize;
919 // Iterative Tarjan to avoid stack overflow on large grammars.
920 let mut work: Vec<(usize, usize)> = Vec::new();
921 for start in 0..n {
922 if index_arr[start] != usize::MAX {
923 continue;
924 }
925 work.push((start, 0));
926 index_arr[start] = next_index;
927 lowlink[start] = next_index;
928 next_index += 1;
929 stack.push(start);
930 on_stack[start] = true;
931 while let Some(&(v, i)) = work.last() {
932 if i < edges[v].len() {
933 let w = edges[v][i];
934 if let Some(slot) = work.last_mut() {
935 slot.1 += 1;
936 }
937 if index_arr[w] == usize::MAX {
938 index_arr[w] = next_index;
939 lowlink[w] = next_index;
940 next_index += 1;
941 stack.push(w);
942 on_stack[w] = true;
943 work.push((w, 0));
944 } else if on_stack[w] && index_arr[w] < lowlink[v] {
945 lowlink[v] = index_arr[w];
946 }
947 } else {
948 if lowlink[v] == index_arr[v] {
949 while let Some(w) = stack.pop() {
950 on_stack[w] = false;
951 comp[w] = next_comp;
952 if w == v {
953 break;
954 }
955 }
956 next_comp += 1;
957 }
958 let lv = lowlink[v];
959 work.pop();
960 if let Some(&(parent, _)) = work.last() {
961 if lv < lowlink[parent] {
962 lowlink[parent] = lv;
963 }
964 }
965 }
966 }
967 }
968 comp
969 }
970 let comp = tarjan(&edges);
971 let num_comps = comp.iter().max().copied().map_or(0, |m| m + 1);
972
973 // 4. For each SCC, accumulate the set of dispatchable nodes reachable
974 // from it. SCCs are emitted in reverse topological order, so when
975 // we process SCC c, every successor SCC has its closure already
976 // computed.
977 let mut scc_members: Vec<Vec<usize>> = vec![Vec::new(); num_comps];
978 for (v, &c) in comp.iter().enumerate() {
979 scc_members[c].push(v);
980 }
981 let mut scc_closure: Vec<HashSet<String>> = vec![HashSet::new(); num_comps];
982 for c in 0..num_comps {
983 // Members of the SCC are mutually reachable.
984 let mut closure: HashSet<String> = HashSet::new();
985 for &v in &scc_members[c] {
986 closure.insert(nodes[v].clone());
987 }
988 // Successor SCCs' closures (already computed).
989 for &v in &scc_members[c] {
990 for &w in &edges[v] {
991 let wc = comp[w];
992 if wc != c {
993 closure.extend(scc_closure[wc].iter().cloned());
994 }
995 }
996 }
997 scc_closure[c] = closure;
998 }
999
1000 // 5. Apply: for each kind K in `subtypes`, replace its dispatchable
1001 // supertypes by their full closure. Non-dispatchable members
1002 // (concrete kinds) stay as-is.
1003 let keys: Vec<String> = subtypes.keys().cloned().collect();
1004 for k in keys {
1005 let existing = subtypes.remove(&k).unwrap_or_default();
1006 let mut new_set: HashSet<String> = HashSet::new();
1007 for s in &existing {
1008 new_set.insert(s.clone());
1009 if let Some(&i) = index_of.get(s.as_str()) {
1010 new_set.extend(scc_closure[comp[i]].iter().cloned());
1011 }
1012 }
1013 subtypes.insert(k, new_set);
1014 }
1015
1016 subtypes
1017}
1018
1019/// Compute the Yield set for every rule in the grammar.
1020///
1021/// `Yield(P)` is the set of concrete vertex kinds that can appear as
1022/// the first named child when production P is taken. See the
1023/// `Grammar::yield_sets` doc comment for the inductive definition.
1024pub(crate) fn compute_yield_sets(
1025 grammar: &Grammar,
1026) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
1027 let mut cache: std::collections::HashMap<String, std::collections::HashSet<String>> =
1028 std::collections::HashMap::new();
1029 for (name, rule) in &grammar.rules {
1030 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
1031 if !expand {
1032 continue;
1033 }
1034 if cache.contains_key(name) {
1035 continue;
1036 }
1037 let mut visited = std::collections::HashSet::new();
1038 let ys = yield_of_production(grammar, rule, &mut visited, &mut cache);
1039 cache.insert(name.clone(), ys);
1040 }
1041 cache
1042}
1043
1044/// Compute the Yield set of an arbitrary production node.
1045///
1046/// Uses `cache` (the partially-built `yield_sets` map) as
1047/// memoization. `visited` tracks the current recursion path to
1048/// detect cycles through hidden/supertype rules; a cycle returns ∅
1049/// (a cycle that never passes through a concrete named symbol
1050/// cannot produce a first child).
1051pub(crate) fn yield_of_production(
1052 grammar: &Grammar,
1053 production: &Production,
1054 visited: &mut std::collections::HashSet<String>,
1055 cache: &mut std::collections::HashMap<String, std::collections::HashSet<String>>,
1056) -> std::collections::HashSet<String> {
1057 match production {
1058 Production::Symbol { name } => {
1059 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
1060 if !expand {
1061 let mut set = std::collections::HashSet::new();
1062 set.insert(name.clone());
1063 return set;
1064 }
1065 if let Some(cached) = cache.get(name) {
1066 return cached.clone();
1067 }
1068 {
1069 if !visited.insert(name.clone()) {
1070 return std::collections::HashSet::new();
1071 }
1072 let result = if let Some(rule) = grammar.rules.get(name) {
1073 yield_of_production(grammar, rule, visited, cache)
1074 } else {
1075 std::collections::HashSet::new()
1076 };
1077 visited.remove(name);
1078 cache.insert(name.clone(), result.clone());
1079 result
1080 }
1081 }
1082 Production::Alias {
1083 content,
1084 named,
1085 value,
1086 } => {
1087 if *named && !value.is_empty() {
1088 let mut set = std::collections::HashSet::new();
1089 set.insert(value.clone());
1090 set
1091 } else {
1092 yield_of_production(grammar, content, visited, cache)
1093 }
1094 }
1095 Production::Seq { members } => {
1096 if members.is_empty() {
1097 let mut set = std::collections::HashSet::new();
1098 set.insert(String::new());
1099 set
1100 } else {
1101 // Walk SEQ members left-to-right. STRING/PATTERN yield ∅
1102 // (anonymous tokens, skipped). Named-child-producing
1103 // members yield a non-empty set. If that set contains ε,
1104 // the member is optional and the next member's yield is
1105 // also reachable. Accumulate until we hit a non-optional
1106 // named-child producer.
1107 let mut combined = std::collections::HashSet::new();
1108 for m in members {
1109 let ys = yield_of_production(grammar, m, visited, cache);
1110 if ys.is_empty() {
1111 continue;
1112 }
1113 let has_epsilon = ys.contains("");
1114 combined.extend(ys);
1115 if !has_epsilon {
1116 break;
1117 }
1118 }
1119 combined
1120 }
1121 }
1122 Production::Choice { members } => {
1123 let mut union = std::collections::HashSet::new();
1124 for m in members {
1125 union.extend(yield_of_production(grammar, m, visited, cache));
1126 }
1127 union
1128 }
1129 Production::Optional { content } => {
1130 let mut set = yield_of_production(grammar, content, visited, cache);
1131 set.insert(String::new());
1132 set
1133 }
1134 Production::Blank => {
1135 let mut set = std::collections::HashSet::new();
1136 set.insert(String::new());
1137 set
1138 }
1139 Production::String { .. } | Production::Pattern { .. } => std::collections::HashSet::new(),
1140 Production::Repeat { content } => {
1141 let mut set = yield_of_production(grammar, content, visited, cache);
1142 set.insert(String::new());
1143 set
1144 }
1145 Production::Repeat1 { content }
1146 | Production::Field { content, .. }
1147 | Production::Token { content }
1148 | Production::ImmediateToken { content }
1149 | Production::Prec { content, .. }
1150 | Production::PrecLeft { content, .. }
1151 | Production::PrecRight { content, .. }
1152 | Production::PrecDynamic { content, .. }
1153 | Production::Reserved { content, .. } => {
1154 yield_of_production(grammar, content, visited, cache)
1155 }
1156 }
1157}
1158
1159// ═══════════════════════════════════════════════════════════════════
1160// node-types.json integration
1161// ═══════════════════════════════════════════════════════════════════
1162
1163/// Parse node-types.json and build a map from parent kind to the set
1164/// of all named child kinds the parser can produce for that parent.
1165pub(crate) type NodeTypeResult = (
1166 std::collections::HashMap<String, std::collections::HashSet<String>>,
1167 std::collections::HashMap<
1168 String,
1169 std::collections::HashMap<String, std::collections::HashSet<String>>,
1170 >,
1171 std::collections::HashMap<String, std::collections::HashSet<String>>,
1172);
1173
1174pub(crate) fn build_node_type_children(nt_bytes: &[u8]) -> NodeTypeResult {
1175 use std::collections::{HashMap, HashSet};
1176 // Use the resilient parser, not a direct `from_slice`: recent
1177 // tree-sitter releases append non-node metadata markers (e.g. erlang's
1178 // `{"@generated": true}`) with no `type` field, which a strict
1179 // `Vec<NodeType>` deserialization rejects — silently zeroing the whole
1180 // node-types child map (and with it FIELD-name reconciliation and the
1181 // subtype augmentation it feeds).
1182 let node_types: Vec<crate::theory_extract::NodeType> =
1183 match crate::theory_extract::parse_node_types(nt_bytes) {
1184 Ok(v) => v,
1185 Err(_) => return (HashMap::new(), HashMap::new(), HashMap::new()),
1186 };
1187 let mut all_map: HashMap<String, HashSet<String>> = HashMap::new();
1188 let mut field_map: HashMap<String, HashMap<String, HashSet<String>>> = HashMap::new();
1189 let mut nonfield_map: HashMap<String, HashSet<String>> = HashMap::new();
1190 for entry in &node_types {
1191 if !entry.named {
1192 continue;
1193 }
1194 let mut child_kinds = HashSet::new();
1195 for (field_name, field_value) in &entry.fields {
1196 if let Some(types) = field_value.get("types").and_then(|t| t.as_array()) {
1197 for t in types {
1198 if let (Some(name), Some(true)) = (
1199 t.get("type").and_then(|n| n.as_str()),
1200 t.get("named").and_then(serde_json::Value::as_bool),
1201 ) {
1202 child_kinds.insert(name.to_owned());
1203 field_map
1204 .entry(entry.node_type.clone())
1205 .or_default()
1206 .entry(field_name.clone())
1207 .or_default()
1208 .insert(name.to_owned());
1209 }
1210 }
1211 }
1212 }
1213 if let Some(ref children) = entry.children {
1214 for t in &children.types {
1215 if t.named {
1216 child_kinds.insert(t.node_type.clone());
1217 nonfield_map
1218 .entry(entry.node_type.clone())
1219 .or_default()
1220 .insert(t.node_type.clone());
1221 }
1222 }
1223 }
1224 if !child_kinds.is_empty() {
1225 all_map.insert(entry.node_type.clone(), child_kinds);
1226 }
1227 }
1228 (all_map, field_map, nonfield_map)
1229}
1230
1231/// Augment `grammar.subtypes` with child-kind data from node-types.json.
1232///
1233/// Uses per-field structural matching: for each parent kind P, each field
1234/// F in P's node-types.json entry, and each child kind C in field F's
1235/// types, find the SYMBOL S referenced at field F's position in P's
1236/// grammar rule. If C lacks a grammar rule and does not already satisfy S,
1237/// record C ⊑ S. Non-field children are matched against non-FIELD symbols
1238/// in the rule body.
1239pub(crate) fn augment_subtypes_from_node_types(grammar: &mut Grammar) {
1240 use std::collections::HashMap;
1241
1242 // Build per-field child-kind map from node-types.json by re-parsing.
1243 let mut pairs: Vec<(String, String)> = Vec::new();
1244 for parent_kind in grammar.node_type_children.keys() {
1245 let Some(rule) = grammar.rules.get(parent_kind) else {
1246 continue;
1247 };
1248
1249 // Collect symbols from the grammar rule, partitioned by the
1250 // FIELD they appear in (or non-field for top-level symbols).
1251 let mut field_symbols: HashMap<String, Vec<String>> = HashMap::new();
1252 let mut non_field_symbols: Vec<String> = Vec::new();
1253 collect_field_symbols(rule, &mut field_symbols, &mut non_field_symbols, false);
1254
1255 // A node-types child kind `K` satisfies a grammar symbol `S` only
1256 // when `S` is a legitimate *dispatch target* that can transparently
1257 // yield `K` (a hidden `_`-rule, a declared supertype, or a rule with
1258 // no literal tokens of its own that inlines under another kind). A
1259 // *concrete self-anchored* rule (own literal tokens / brackets /
1260 // keywords, e.g. cpp `attribute_declaration = SEQ["[[", …, "]]"]`)
1261 // always materialises under its OWN name, so it can never be the
1262 // surface kind of a *different* child. The node-types `children.types`
1263 // list is a flat sibling set: a rule with several distinct non-field
1264 // children (cpp `base_class_clause` = access_specifier + type_identifier
1265 // + attribute_declaration) would otherwise cross-product every child
1266 // kind with every symbol, spuriously making `type_identifier ⊑
1267 // attribute_declaration` and letting a leading REPEAT(attribute_decl)
1268 // steal the `type_identifier` (base-class reorder). Mirror `sat`'s
1269 // non-transitivity: skip self-anchored concrete targets.
1270 let dispatch_target = |grammar: &Grammar, sym: &str| -> bool {
1271 sym.starts_with('_')
1272 || grammar.supertypes.contains(sym)
1273 || grammar
1274 .rules
1275 .get(sym)
1276 .is_none_or(|r| literal_strings(r).is_empty())
1277 };
1278
1279 // Per-field augmentation: for each FIELD F in the grammar rule,
1280 // match child kinds that node-types.json says appear in field F
1281 // against the symbols at field F's position.
1282 if let Some(nt_fields) = grammar.node_type_field_children.get(parent_kind) {
1283 for (field_name, nt_child_kinds) in nt_fields {
1284 let Some(rule_syms) = field_symbols.get(field_name) else {
1285 continue;
1286 };
1287 for child_kind in nt_child_kinds {
1288 if grammar.rules.contains_key(child_kind) {
1289 continue;
1290 }
1291 for sym_name in rule_syms {
1292 if dispatch_target(grammar, sym_name)
1293 && !kind_satisfies_symbol(grammar, Some(child_kind), sym_name)
1294 {
1295 pairs.push((child_kind.clone(), sym_name.clone()));
1296 }
1297 }
1298 }
1299 }
1300 }
1301
1302 // Non-field augmentation: for child kinds from `children.types`
1303 // (no field), match against non-FIELD symbols in the rule.
1304 if let Some(nt_nonfield) = grammar.node_type_nonfield_children.get(parent_kind) {
1305 for child_kind in nt_nonfield {
1306 if grammar.rules.contains_key(child_kind) {
1307 continue;
1308 }
1309 for sym_name in &non_field_symbols {
1310 if dispatch_target(grammar, sym_name)
1311 && !kind_satisfies_symbol(grammar, Some(child_kind), sym_name)
1312 {
1313 pairs.push((child_kind.clone(), sym_name.clone()));
1314 }
1315 }
1316 }
1317 }
1318 }
1319 for (child_kind, sym_name) in pairs {
1320 grammar
1321 .subtypes
1322 .entry(child_kind)
1323 .or_default()
1324 .insert(sym_name);
1325 }
1326}
1327
1328/// Walk a production and collect referenced symbols, separating those
1329/// inside FIELD bodies (keyed by field name) from those outside any FIELD.
1330/// Reconcile grammar.json `FIELD` names against node-types.json when they
1331/// desync. The vendored `grammar.json` and the generated `parser.c`
1332/// (reflected by `node-types.json`) can drift: a `FIELD(exprs, _expr)` in
1333/// `grammar.json` may correspond to a field the parser actually emits as
1334/// `expr` (erlang `list_comprehension`'s template). The schema's child
1335/// edges carry the *parser's* field name, so the emit walker's
1336/// `take_field("exprs")` finds nothing and the child is dropped.
1337///
1338/// `node-types.json` is authoritative for the parser's field names. For
1339/// each rule whose kind appears there, when there is exactly ONE grammar
1340/// `FIELD` name absent from the node-types field set and exactly ONE
1341/// node-types field name absent from the grammar's field set, the grammar
1342/// name is the stale one: rewrite every `FIELD` carrying it to the
1343/// node-types name. The 1-to-1 constraint keeps this to the unambiguous
1344/// rename case (a genuine drift), never a structural reinterpretation.
1345pub(crate) fn reconcile_field_names(grammar: &mut Grammar) {
1346 use std::collections::HashSet;
1347 let mut renames: Vec<(String, String, String)> = Vec::new();
1348 for (kind, nt_fields) in &grammar.node_type_field_children {
1349 let Some(rule) = grammar.rules.get(kind) else {
1350 continue;
1351 };
1352 let mut grammar_fields: HashSet<String> = HashSet::new();
1353 collect_grammar_field_names(rule, &mut grammar_fields);
1354 let nt_names: HashSet<&String> = nt_fields.keys().collect();
1355 let grammar_only: Vec<&String> = grammar_fields
1356 .iter()
1357 .filter(|f| !nt_names.contains(f))
1358 .collect();
1359 let nt_only: Vec<&String> = nt_fields
1360 .keys()
1361 .filter(|f| !grammar_fields.contains(*f))
1362 .collect();
1363 // Exactly one stale grammar field and one unmatched parser field:
1364 // the unambiguous drift. (Zero-diff rules and many-to-many
1365 // reshuffles are left untouched.)
1366 if grammar_only.len() == 1 && nt_only.len() == 1 {
1367 renames.push((kind.clone(), grammar_only[0].clone(), nt_only[0].clone()));
1368 }
1369 }
1370 for (kind, from, to) in renames {
1371 if let Some(rule) = grammar.rules.get_mut(&kind) {
1372 rename_field_in(rule, &from, &to);
1373 }
1374 }
1375}
1376
1377/// Collect the set of `FIELD` names appearing anywhere in `prod`.
1378fn collect_grammar_field_names(prod: &Production, out: &mut std::collections::HashSet<String>) {
1379 match prod {
1380 Production::Field { name, content } => {
1381 out.insert(name.clone());
1382 collect_grammar_field_names(content, out);
1383 }
1384 Production::Choice { members } | Production::Seq { members } => {
1385 for m in members {
1386 collect_grammar_field_names(m, out);
1387 }
1388 }
1389 Production::Repeat { content }
1390 | Production::Repeat1 { content }
1391 | Production::Optional { content }
1392 | Production::Alias { content, .. }
1393 | Production::Token { content }
1394 | Production::ImmediateToken { content }
1395 | Production::Prec { content, .. }
1396 | Production::PrecLeft { content, .. }
1397 | Production::PrecRight { content, .. }
1398 | Production::PrecDynamic { content, .. }
1399 | Production::Reserved { content, .. } => collect_grammar_field_names(content, out),
1400 _ => {}
1401 }
1402}
1403
1404/// Rewrite every `FIELD(from, …)` to `FIELD(to, …)` within `prod`.
1405fn rename_field_in(prod: &mut Production, from: &str, to: &str) {
1406 match prod {
1407 Production::Field { name, content } => {
1408 if name == from {
1409 to.clone_into(name);
1410 }
1411 rename_field_in(content, from, to);
1412 }
1413 Production::Choice { members } | Production::Seq { members } => {
1414 for m in members {
1415 rename_field_in(m, from, to);
1416 }
1417 }
1418 Production::Repeat { content }
1419 | Production::Repeat1 { content }
1420 | Production::Optional { content }
1421 | Production::Alias { content, .. }
1422 | Production::Token { content }
1423 | Production::ImmediateToken { content }
1424 | Production::Prec { content, .. }
1425 | Production::PrecLeft { content, .. }
1426 | Production::PrecRight { content, .. }
1427 | Production::PrecDynamic { content, .. }
1428 | Production::Reserved { content, .. } => rename_field_in(content, from, to),
1429 _ => {}
1430 }
1431}
1432
1433pub(crate) fn collect_field_symbols(
1434 prod: &Production,
1435 field_map: &mut std::collections::HashMap<String, Vec<String>>,
1436 non_field: &mut Vec<String>,
1437 inside_field: bool,
1438) {
1439 match prod {
1440 Production::Symbol { name } if !inside_field => {
1441 non_field.push(name.clone());
1442 }
1443 Production::Field { name, content } => {
1444 let mut syms = Vec::new();
1445 collect_symbols_flat(content, &mut syms);
1446 field_map.entry(name.clone()).or_default().extend(syms);
1447 }
1448 Production::Choice { members } | Production::Seq { members } => {
1449 for m in members {
1450 collect_field_symbols(m, field_map, non_field, inside_field);
1451 }
1452 }
1453 Production::Repeat { content }
1454 | Production::Repeat1 { content }
1455 | Production::Optional { content }
1456 | Production::Alias { content, .. }
1457 | Production::Token { content }
1458 | Production::ImmediateToken { content }
1459 | Production::Prec { content, .. }
1460 | Production::PrecLeft { content, .. }
1461 | Production::PrecRight { content, .. }
1462 | Production::PrecDynamic { content, .. }
1463 | Production::Reserved { content, .. } => {
1464 collect_field_symbols(content, field_map, non_field, inside_field);
1465 }
1466 _ => {}
1467 }
1468}
1469
1470pub(crate) fn collect_symbols_flat(prod: &Production, out: &mut Vec<String>) {
1471 match prod {
1472 Production::Symbol { name } => out.push(name.clone()),
1473 Production::Choice { members } | Production::Seq { members } => {
1474 for m in members {
1475 collect_symbols_flat(m, out);
1476 }
1477 }
1478 Production::Repeat { content }
1479 | Production::Repeat1 { content }
1480 | Production::Optional { content }
1481 | Production::Alias { content, .. }
1482 | Production::Field { content, .. }
1483 | Production::Token { content }
1484 | Production::ImmediateToken { content }
1485 | Production::Prec { content, .. }
1486 | Production::PrecLeft { content, .. }
1487 | Production::PrecRight { content, .. }
1488 | Production::PrecDynamic { content, .. }
1489 | Production::Reserved { content, .. } => collect_symbols_flat(content, out),
1490 _ => {}
1491 }
1492}
1493
1494/// Build a map from external scanner symbol names to their anonymous
1495/// ALIAS values by walking every rule body in the grammar.
1496pub(crate) fn build_external_alias_map(
1497 grammar: &Grammar,
1498) -> std::collections::HashMap<String, String> {
1499 let mut map = std::collections::HashMap::new();
1500 fn walk(
1501 grammar: &Grammar,
1502 prod: &Production,
1503 map: &mut std::collections::HashMap<String, String>,
1504 ) {
1505 match prod {
1506 Production::Alias {
1507 content,
1508 named,
1509 value,
1510 } => {
1511 if !*named && !value.is_empty() {
1512 if let Production::Symbol { name } = content.as_ref() {
1513 if name.starts_with('_') && !grammar.rules.contains_key(name) {
1514 map.entry(name.clone()).or_insert_with(|| value.clone());
1515 }
1516 }
1517 }
1518 walk(grammar, content, map);
1519 }
1520 Production::Choice { members } | Production::Seq { members } => {
1521 for m in members {
1522 walk(grammar, m, map);
1523 }
1524 }
1525 Production::Repeat { content }
1526 | Production::Repeat1 { content }
1527 | Production::Optional { content }
1528 | Production::Field { content, .. }
1529 | Production::Token { content }
1530 | Production::ImmediateToken { content }
1531 | Production::Prec { content, .. }
1532 | Production::PrecLeft { content, .. }
1533 | Production::PrecRight { content, .. }
1534 | Production::PrecDynamic { content, .. }
1535 | Production::Reserved { content, .. } => walk(grammar, content, map),
1536 _ => {}
1537 }
1538 }
1539 for rule in grammar.rules.values() {
1540 walk(grammar, rule, &mut map);
1541 }
1542 map
1543}
1544
1545/// Build a map from named-alias values to their source symbol names.
1546/// When tree-sitter emits a vertex with kind `V` via
1547/// `alias($.source, $.V)`, the grammar has no rule keyed by `V`.
1548/// This map lets the emitter resolve `V → source` and walk the source
1549/// rule with proper token roles and bracket pairs.
1550pub(crate) fn build_named_alias_map(
1551 grammar: &Grammar,
1552) -> std::collections::HashMap<String, String> {
1553 let mut map = std::collections::HashMap::new();
1554 fn walk(prod: &Production, map: &mut std::collections::HashMap<String, String>) {
1555 match prod {
1556 Production::Alias {
1557 content,
1558 named,
1559 value,
1560 } => {
1561 if *named && !value.is_empty() {
1562 if let Production::Symbol { name } = content.as_ref() {
1563 map.entry(value.clone()).or_insert_with(|| name.clone());
1564 }
1565 }
1566 walk(content, map);
1567 }
1568 Production::Choice { members } | Production::Seq { members } => {
1569 for m in members {
1570 walk(m, map);
1571 }
1572 }
1573 Production::Repeat { content }
1574 | Production::Repeat1 { content }
1575 | Production::Optional { content }
1576 | Production::Field { content, .. }
1577 | Production::Token { content }
1578 | Production::ImmediateToken { content }
1579 | Production::Prec { content, .. }
1580 | Production::PrecLeft { content, .. }
1581 | Production::PrecRight { content, .. }
1582 | Production::PrecDynamic { content, .. }
1583 | Production::Reserved { content, .. } => walk(content, map),
1584 _ => {}
1585 }
1586 }
1587 for rule in grammar.rules.values() {
1588 walk(rule, &mut map);
1589 }
1590 map
1591}
1592
1593/// Every source rule that aliases to each kind, deduplicated, in
1594/// first-seen order. Unlike [`build_named_alias_map`] (which keeps only
1595/// the first source per kind) this records the full set, so the emitter
1596/// can choose, among several rules that collapse to the same surface
1597/// kind, the one whose production admits a given vertex's children.
1598pub(crate) fn build_named_alias_sources(
1599 grammar: &Grammar,
1600) -> std::collections::HashMap<String, Vec<String>> {
1601 let mut map: std::collections::HashMap<String, Vec<String>> = std::collections::HashMap::new();
1602 fn walk(prod: &Production, map: &mut std::collections::HashMap<String, Vec<String>>) {
1603 match prod {
1604 Production::Alias {
1605 content,
1606 named,
1607 value,
1608 } => {
1609 if *named && !value.is_empty() {
1610 if let Production::Symbol { name } = content.as_ref() {
1611 let srcs = map.entry(value.clone()).or_default();
1612 if !srcs.contains(name) {
1613 srcs.push(name.clone());
1614 }
1615 }
1616 }
1617 walk(content, map);
1618 }
1619 Production::Choice { members } | Production::Seq { members } => {
1620 for m in members {
1621 walk(m, map);
1622 }
1623 }
1624 Production::Repeat { content }
1625 | Production::Repeat1 { content }
1626 | Production::Optional { content }
1627 | Production::Field { content, .. }
1628 | Production::Token { content }
1629 | Production::ImmediateToken { content }
1630 | Production::Prec { content, .. }
1631 | Production::PrecLeft { content, .. }
1632 | Production::PrecRight { content, .. }
1633 | Production::PrecDynamic { content, .. }
1634 | Production::Reserved { content, .. } => walk(content, map),
1635 _ => {}
1636 }
1637 }
1638 for rule in grammar.rules.values() {
1639 walk(rule, &mut map);
1640 }
1641 map
1642}
1643
1644/// Compute token roles for every STRING value in every grammar rule.
1645///
1646/// For each rule R, analyzes the production body to classify every
1647/// STRING token by its structural role (bracket-open, bracket-close,
1648/// separator, keyword, operator). Also identifies which bracket-open
1649/// tokens trigger indentation (those with REPEAT/REPEAT1 between
1650/// the open and close).
1651///
1652/// Bracket pairs are detected per-SEQ, not from a fixed character
1653/// set. Two STRINGs are a matched pair iff they are the first and
1654/// last STRING-typed members of the same SEQ with at least one
1655/// non-STRING member between them and open != close.
1656pub(crate) type RoleMap =
1657 std::collections::HashMap<String, std::collections::HashMap<String, TokenRole>>;
1658
1659pub(crate) type IndentSet = std::collections::HashSet<(String, String)>;
1660
1661pub(crate) fn compute_token_roles(grammar: &Grammar) -> (RoleMap, IndentSet) {
1662 use std::collections::{HashMap, HashSet};
1663 let mut all_roles: HashMap<String, HashMap<String, TokenRole>> = HashMap::new();
1664 let mut indent_triggers: HashSet<(String, String)> = HashSet::new();
1665
1666 for (rule_name, rule) in &grammar.rules {
1667 let mut roles: HashMap<String, TokenRole> = HashMap::new();
1668 classify_production(rule, &mut roles, &mut indent_triggers, rule_name);
1669 if !roles.is_empty() {
1670 all_roles.insert(rule_name.clone(), roles);
1671 }
1672 }
1673
1674 (all_roles, indent_triggers)
1675}
1676
1677/// Recursively classify STRING tokens in a production body.
1678pub(crate) fn classify_production(
1679 prod: &Production,
1680 roles: &mut std::collections::HashMap<String, TokenRole>,
1681 indent_triggers: &mut std::collections::HashSet<(String, String)>,
1682 rule_name: &str,
1683) {
1684 match prod {
1685 Production::Seq { members } => {
1686 classify_seq(members, roles, indent_triggers, rule_name, false);
1687 }
1688 Production::Choice { members } => {
1689 for m in members {
1690 // CHOICE alternatives' SEQs get in_choice=true so that
1691 // position-0 STRINGs are classified as Operators (not
1692 // prefix sigils). E.g. `=` in `CHOICE [SEQ ["=", ...]]`
1693 // is an operator, not a prefix.
1694 match m {
1695 Production::Seq {
1696 members: seq_members,
1697 } => {
1698 classify_seq(seq_members, roles, indent_triggers, rule_name, true);
1699 }
1700 _ => classify_production(m, roles, indent_triggers, rule_name),
1701 }
1702 }
1703 }
1704 Production::Repeat { content } | Production::Repeat1 { content } => {
1705 classify_repeat_body(content, roles, indent_triggers, rule_name);
1706 }
1707 Production::Optional { content }
1708 | Production::Field { content, .. }
1709 | Production::Token { content }
1710 | Production::ImmediateToken { content }
1711 | Production::Prec { content, .. }
1712 | Production::PrecLeft { content, .. }
1713 | Production::PrecRight { content, .. }
1714 | Production::PrecDynamic { content, .. }
1715 | Production::Reserved { content, .. } => {
1716 classify_production(content, roles, indent_triggers, rule_name);
1717 }
1718 Production::Alias { content, .. } => {
1719 classify_production(content, roles, indent_triggers, rule_name);
1720 }
1721 _ => {}
1722 }
1723}
1724
1725/// Classify STRING tokens within a SEQ. This is where bracket pairs
1726/// are detected and roles assigned.
1727pub(crate) fn classify_seq(
1728 members: &[Production],
1729 roles: &mut std::collections::HashMap<String, TokenRole>,
1730 indent_triggers: &mut std::collections::HashSet<(String, String)>,
1731 rule_name: &str,
1732 in_choice: bool,
1733) {
1734 let string_positions: Vec<(usize, &str)> = members
1735 .iter()
1736 .enumerate()
1737 .filter_map(|(i, m)| unwrap_to_string(m).map(|s| (i, s)))
1738 .collect();
1739
1740 let content_count = members
1741 .iter()
1742 .filter(|m| unwrap_to_string(m).is_none())
1743 .count();
1744
1745 if string_positions.len() >= 2 {
1746 let (first_idx, first_val) = string_positions[0];
1747 let (last_idx, last_val) = string_positions[string_positions.len() - 1];
1748
1749 let has_content_between = members[first_idx + 1..last_idx]
1750 .iter()
1751 .any(|m| unwrap_to_string(m).is_none());
1752
1753 let both_punct = !is_word_like(first_val) && !is_word_like(last_val);
1754 let both_word = is_word_like(first_val) && is_word_like(last_val);
1755 if has_content_between && first_val != last_val && (both_punct || both_word) {
1756 roles.insert(first_val.to_owned(), TokenRole::BracketOpen);
1757 roles.insert(last_val.to_owned(), TokenRole::BracketClose);
1758
1759 let between = &members[first_idx + 1..last_idx];
1760 if first_val == "{" && has_repeat_recursive(between) {
1761 indent_triggers.insert((rule_name.to_owned(), first_val.to_owned()));
1762 }
1763 }
1764 }
1765
1766 // An optional leading unary sign (`CHOICE[- | BLANK]` / `OPTIONAL(-)`
1767 // at the head of the SEQ, with an operand after it) is a tight prefix
1768 // on that operand: `signed_number = SEQ[CHOICE[- | BLANK], number]`
1769 // emits `-1.0`, not `- 1.0`. The sign lives inside a CHOICE, so the
1770 // per-position pass below never sees it; classify it here.
1771 if members.len() >= 2 {
1772 if let Some(first) = members.first() {
1773 let has_following_content = members[1..].iter().any(|m| unwrap_to_string(m).is_none());
1774 if has_following_content {
1775 for sign in leading_optional_sign(first) {
1776 roles.entry(sign).or_insert(TokenRole::BracketOpen);
1777 }
1778 }
1779 }
1780 }
1781
1782 // Classify remaining STRINGs by structural position.
1783 let first_content_idx = members.iter().position(|m| unwrap_to_string(m).is_none());
1784 let last_content_idx = members.iter().rposition(|m| unwrap_to_string(m).is_none());
1785
1786 for (i, m) in members.iter().enumerate() {
1787 if let Some(value) = unwrap_to_string(m) {
1788 let value = value.to_owned();
1789 if !roles.contains_key(&value) {
1790 if is_word_like(&value) {
1791 roles.insert(value.clone(), TokenRole::Keyword);
1792 } else if !in_choice
1793 && first_content_idx.is_some_and(|fc| i < fc)
1794 && is_prefix_sigil(&value)
1795 {
1796 roles.insert(value.clone(), TokenRole::BracketOpen);
1797 } else if last_content_idx.is_some_and(|lc| i > lc) {
1798 // STRING after all content: suffix (tight before).
1799 // Unlike prefix, this applies in CHOICE branches too
1800 // (e.g. `()` in bash function_definition's CHOICE).
1801 roles.insert(value.clone(), TokenRole::BracketClose);
1802 } else if !in_choice
1803 && string_positions.len() == 1
1804 && content_count == 2
1805 && value.len() == 1
1806 {
1807 // Single-character STRING between exactly two content
1808 // members in a non-CHOICE SEQ: this is a connector
1809 // (e.g. `.` in `SEQ [object, ".", attr]`).
1810 // Multi-character tokens like `:=`, `<-`, `->` are
1811 // operators (spaced), not connectors.
1812 roles.insert(value.clone(), TokenRole::Connector);
1813 } else {
1814 roles.insert(value.clone(), TokenRole::Operator);
1815 }
1816 }
1817 }
1818 }
1819
1820 for m in members {
1821 if unwrap_to_string(m).is_none() {
1822 classify_production(m, roles, indent_triggers, rule_name);
1823 }
1824 }
1825}
1826
1827/// Classify STRING tokens in a REPEAT body. The first STRING in a
1828/// REPEAT body's inner SEQ is a separator (e.g. `,` in
1829/// `REPEAT(SEQ [",", item])`).
1830pub(crate) fn classify_repeat_body(
1831 content: &Production,
1832 roles: &mut std::collections::HashMap<String, TokenRole>,
1833 indent_triggers: &mut std::collections::HashSet<(String, String)>,
1834 rule_name: &str,
1835) {
1836 match content {
1837 Production::Seq { members } => {
1838 if let Some(Production::String { value }) = members.first() {
1839 roles.insert(value.clone(), TokenRole::Separator);
1840 }
1841 classify_seq(members, roles, indent_triggers, rule_name, false);
1842 }
1843 _ => classify_production(content, roles, indent_triggers, rule_name),
1844 }
1845}
1846
1847/// Classify STRING tokens within a SEQ by structural position, returning
1848/// a role for each member position. Non-STRING positions get `None`.
1849/// This is the inline variant of `classify_seq` used at emission time
1850/// to avoid the flat per-rule map's conflation of same-text tokens.
1851pub(crate) fn classify_seq_positions(
1852 members: &[Production],
1853 in_choice: bool,
1854) -> Vec<Option<TokenRole>> {
1855 let mut roles: Vec<Option<TokenRole>> = vec![None; members.len()];
1856
1857 let string_positions: Vec<(usize, &str)> = members
1858 .iter()
1859 .enumerate()
1860 .filter_map(|(i, m)| unwrap_to_string(m).map(|s| (i, s)))
1861 .collect();
1862
1863 let content_count = members
1864 .iter()
1865 .filter(|m| unwrap_to_string(m).is_none())
1866 .count();
1867
1868 // Bracket pair detection.
1869 let mut bracket_open_idx: Option<usize> = None;
1870 let mut bracket_close_idx: Option<usize> = None;
1871
1872 // Canonical pairing first: pair an actual `(`/`[`/`{` with its matching
1873 // closer, even when other STRINGs (a prefix operator, a trailing `;`)
1874 // sit at the SEQ ends. `sampling_statement` (`expr ~ f ( args ) ;`)
1875 // must pair `(`/`)`, not `~`/`;`.
1876 for &(oi, ov) in &string_positions {
1877 let Some(close_text) = matching_close_bracket(ov) else {
1878 continue;
1879 };
1880 if let Some(&(ci, _)) = string_positions
1881 .iter()
1882 .rev()
1883 .find(|(_, v)| *v == close_text)
1884 {
1885 if oi < ci
1886 && members[oi + 1..ci]
1887 .iter()
1888 .any(|m| unwrap_to_string(m).is_none())
1889 {
1890 roles[oi] = Some(TokenRole::BracketOpen);
1891 roles[ci] = Some(TokenRole::BracketClose);
1892 bracket_open_idx = Some(oi);
1893 bracket_close_idx = Some(ci);
1894 break;
1895 }
1896 }
1897 }
1898
1899 // First/last STRING fallback: handles word-like pairs (begin/end) and
1900 // same-text immediate delimiters (regex `/.../`) that the canonical
1901 // search does not recognise.
1902 if bracket_open_idx.is_none() && string_positions.len() >= 2 {
1903 let (first_idx, first_val) = string_positions[0];
1904 let (last_idx, last_val) = string_positions[string_positions.len() - 1];
1905
1906 let has_content_between = members[first_idx + 1..last_idx]
1907 .iter()
1908 .any(|m| unwrap_to_string(m).is_none());
1909
1910 let both_punct = !is_word_like(first_val) && !is_word_like(last_val);
1911 let both_word = is_word_like(first_val) && is_word_like(last_val);
1912 // Same-text delimiters (e.g. regex `/.../`) are a bracket pair
1913 // when at least one side is IMMEDIATE_TOKEN — the grammar's
1914 // structural signal that the delimiter must be tight against
1915 // the content.
1916 let either_immediate =
1917 is_immediate_token(&members[first_idx]) || is_immediate_token(&members[last_idx]);
1918 let same_text_immediate = first_val == last_val && either_immediate;
1919 if has_content_between
1920 && (both_punct || both_word)
1921 && (first_val != last_val || same_text_immediate)
1922 {
1923 roles[first_idx] = Some(TokenRole::BracketOpen);
1924 roles[last_idx] = Some(TokenRole::BracketClose);
1925 bracket_open_idx = Some(first_idx);
1926 bracket_close_idx = Some(last_idx);
1927 }
1928 }
1929
1930 let first_content_idx = members.iter().position(|m| unwrap_to_string(m).is_none());
1931 let last_content_idx = members.iter().rposition(|m| unwrap_to_string(m).is_none());
1932
1933 for (i, m) in members.iter().enumerate() {
1934 if roles[i].is_some() {
1935 continue;
1936 }
1937 if let Some(value) = unwrap_to_string(m) {
1938 roles[i] = Some(if is_immediate_token(m) {
1939 // The grammar wraps this token in IMMEDIATE_TOKEN: the
1940 // lexer glues it to its neighbours (the `.` in a float
1941 // `0.5`). Derive tightness from that fact rather than
1942 // guessing from position.
1943 TokenRole::Immediate
1944 } else if is_word_like(value) {
1945 TokenRole::Keyword
1946 } else if !in_choice && first_content_idx.is_some_and(|fc| i < fc) {
1947 if is_prefix_sigil(value) {
1948 TokenRole::BracketOpen
1949 } else {
1950 TokenRole::Operator
1951 }
1952 } else if last_content_idx.is_some_and(|lc| i > lc) {
1953 TokenRole::BracketClose
1954 } else if !in_choice
1955 && string_positions.len() == 1
1956 && content_count == 2
1957 && value.len() == 1
1958 {
1959 TokenRole::Connector
1960 } else {
1961 TokenRole::Operator
1962 });
1963 }
1964 }
1965
1966 // Override: in a REPEAT body's inner SEQ, the first STRING is a
1967 // separator. This is checked by the caller (REPEAT handler), not here.
1968 // But we do store bracket indices for the caller to use.
1969 let _ = (bracket_open_idx, bracket_close_idx);
1970
1971 roles
1972}
1973
1974/// Extract line-comment prefixes from the grammar's extras rules.
1975///
1976/// A line comment is identified by: the rule name is in
1977/// `grammar.extras` AND the rule body structurally matches
1978/// `TOKEN(SEQ [STRING prefix, PATTERN ...])` where the PATTERN
1979/// matches to end-of-line.
1980pub(crate) fn extract_line_comment_prefixes(grammar: &Grammar) -> Vec<String> {
1981 let mut prefixes = Vec::new();
1982 for extra_name in &grammar.extras {
1983 if let Some(rule) = grammar.rules.get(extra_name) {
1984 if let Some(prefix) = extract_line_comment_prefix(rule) {
1985 prefixes.push(prefix);
1986 }
1987 }
1988 }
1989 prefixes
1990}
1991
1992/// Does this production, after resolving SYMBOL references and unwrapping
1993/// transparent wrappers, BEGIN with a newline-only token? Used to detect
1994/// the trailing element of a hard-line-break SEQ (`_soft_line_break`,
1995/// `_newline_token`, a bare `\n` STRING/PATTERN).
1996fn production_is_newline_leading(
1997 grammar: &Grammar,
1998 prod: &Production,
1999 seen: &mut std::collections::HashSet<String>,
2000) -> bool {
2001 match prod {
2002 Production::String { value } | Production::Pattern { value } => {
2003 is_newline_like_pattern(value)
2004 }
2005 Production::Seq { members } => members
2006 .first()
2007 .is_some_and(|m| production_is_newline_leading(grammar, m, seen)),
2008 Production::Symbol { name } => {
2009 if !seen.insert(name.clone()) {
2010 return false;
2011 }
2012 grammar
2013 .rules
2014 .get(name)
2015 .is_some_and(|r| production_is_newline_leading(grammar, r, seen))
2016 }
2017 Production::Token { content }
2018 | Production::ImmediateToken { content }
2019 | Production::Prec { content, .. }
2020 | Production::PrecLeft { content, .. }
2021 | Production::PrecRight { content, .. }
2022 | Production::PrecDynamic { content, .. }
2023 | Production::Field { content, .. }
2024 | Production::Alias { content, .. }
2025 | Production::Reserved { content, .. } => {
2026 production_is_newline_leading(grammar, content, seen)
2027 }
2028 _ => false,
2029 }
2030}
2031
2032/// Collect the leading "hard line break" markers (see
2033/// [`Grammar::trailing_break_markers`]).
2034///
2035/// A production `SEQ[first, .., last]` is a hard-line-break idiom when its
2036/// LAST member is newline-leading. The FIRST member then carries the break
2037/// marker: a CHOICE (or bare element) whose alternatives are either a
2038/// single-character non-alphanumeric STRING (`\`) — a standalone break
2039/// marker — or a whitespace-only PATTERN (`[ \t]+`). The former is
2040/// collected literally; the latter sets the whitespace-sensitivity flag.
2041pub(crate) fn classify_trailing_break_markers(grammar: &Grammar) -> (Vec<String>, bool) {
2042 fn collect_marker_alts(
2043 grammar: &Grammar,
2044 prod: &Production,
2045 lits: &mut Vec<String>,
2046 ws: &mut bool,
2047 seen: &mut std::collections::HashSet<String>,
2048 ) {
2049 match prod {
2050 Production::Choice { members } => {
2051 for m in members {
2052 collect_marker_alts(grammar, m, lits, ws, seen);
2053 }
2054 }
2055 Production::Symbol { name } => {
2056 // Resolve hidden break-marker symbols (`_whitespace_ge_2`)
2057 // one level, cycle-guarded. A concrete named rule is line
2058 // content in its own right, never a bare break marker.
2059 if let Some(r) = grammar
2060 .rules
2061 .get(name)
2062 .filter(|_| seen.insert(name.clone()))
2063 {
2064 collect_marker_alts(grammar, r, lits, ws, seen);
2065 }
2066 }
2067 Production::Token { content }
2068 | Production::ImmediateToken { content }
2069 | Production::Prec { content, .. }
2070 | Production::PrecLeft { content, .. }
2071 | Production::PrecRight { content, .. }
2072 | Production::PrecDynamic { content, .. }
2073 | Production::Field { content, .. }
2074 | Production::Alias { content, .. }
2075 | Production::Reserved { content, .. } => {
2076 collect_marker_alts(grammar, content, lits, ws, seen);
2077 }
2078 Production::String { value } => {
2079 // A standalone single-character non-alphanumeric break marker
2080 // (the `\` of a hard line break). Multi-character / word-like
2081 // literals are substantive line content, not break markers.
2082 let mut chars = value.chars();
2083 if let (Some(c), None) = (chars.next(), chars.clone().next()) {
2084 if !c.is_alphanumeric() && !c.is_whitespace() {
2085 lits.push(value.clone());
2086 }
2087 }
2088 }
2089 Production::Pattern { value } => {
2090 // A whitespace-only PATTERN, possibly a top-level alternation
2091 // of whitespace branches (`_whitespace_ge_2 = \t| [ \t]+`).
2092 let ws_only = super::split_top_level_alternation(value)
2093 .iter()
2094 .all(|b| is_whitespace_only_pattern(b.trim()));
2095 if ws_only {
2096 *ws = true;
2097 }
2098 }
2099 _ => {}
2100 }
2101 }
2102
2103 let mut lits: Vec<String> = Vec::new();
2104 let mut ws = false;
2105 for rule in grammar.rules.values() {
2106 if let Production::Seq { members } = rule {
2107 if members.len() >= 2
2108 && production_is_newline_leading(
2109 grammar,
2110 members.last().expect("len >= 2"),
2111 &mut std::collections::HashSet::new(),
2112 )
2113 {
2114 collect_marker_alts(
2115 grammar,
2116 &members[0],
2117 &mut lits,
2118 &mut ws,
2119 &mut std::collections::HashSet::new(),
2120 );
2121 }
2122 }
2123 }
2124 lits.sort();
2125 lits.dedup();
2126 (lits, ws)
2127}
2128
2129/// Extract the start symbol (first rule key) from raw grammar.json bytes.
2130///
2131/// tree-sitter's start symbol is the first rule as written, but the
2132/// [`Grammar::rules`] `BTreeMap` reorders alphabetically. We recover the
2133/// original first key with a minimal scan: locate the top-level `"rules"`
2134/// object and read its first JSON string key. Returns an empty string on
2135/// any malformed input (the caller then falls back to no special-casing).
2136fn extract_start_symbol(bytes: &[u8]) -> String {
2137 let Ok(text) = std::str::from_utf8(bytes) else {
2138 return String::new();
2139 };
2140 let Some(rules_at) = text.find("\"rules\"") else {
2141 return String::new();
2142 };
2143 let after = &text[rules_at + "\"rules\"".len()..];
2144 // Skip to the object's opening brace.
2145 let Some(brace) = after.find('{') else {
2146 return String::new();
2147 };
2148 let mut chars = after[brace + 1..].char_indices();
2149 // Find the first string key.
2150 for (_, c) in chars.by_ref() {
2151 if c == '"' {
2152 break;
2153 }
2154 if !c.is_whitespace() {
2155 return String::new();
2156 }
2157 }
2158 let mut key = String::new();
2159 while let Some((_, c)) = chars.next() {
2160 match c {
2161 // Skip the escaped character after a backslash (rule names are
2162 // plain identifiers, but stay robust to escapes).
2163 '\\' => {
2164 chars.next();
2165 }
2166 '"' => return key,
2167 _ => key.push(c),
2168 }
2169 }
2170 String::new()
2171}
2172
2173/// Does this PATTERN value match a bare newline? True when some top-level
2174/// alternation branch is a negated character class (`[^...]`, optionally
2175/// quantified or anchored to a following atom) that does NOT exclude `\n`
2176/// or `\r` — the free-text-content idiom (`liquid`'s `[^{]+`).
2177fn pattern_admits_newline(value: &str) -> bool {
2178 for branch in super::split_top_level_alternation(value) {
2179 let b = branch.trim();
2180 if let Some(rest) = b.strip_prefix("[^") {
2181 if let Some(idx) = rest.find(']') {
2182 let inner = &rest[..idx];
2183 // A negated class admits newline unless it explicitly lists
2184 // a newline atom (`\n` / `\r`).
2185 if !inner.contains("\\n") && !inner.contains("\\r") {
2186 return true;
2187 }
2188 }
2189 }
2190 }
2191 false
2192}
2193
2194/// Determine whether the grammar's top-level document repeat directly
2195/// admits a free-text content node matching a bare newline (see
2196/// [`Grammar::top_level_text_admits_newline`]).
2197pub(crate) fn classify_top_level_text_admits_newline(grammar: &Grammar) -> bool {
2198 // The start symbol is the FIRST rule as written in grammar.json.
2199 let Some(start_body) = grammar.rules.get(&grammar.start_symbol) else {
2200 return false;
2201 };
2202
2203 // Collect the concrete content kinds that are DIRECT members of the
2204 // start symbol's top-level repeat, descending through SEQ / REPEAT /
2205 // CHOICE / OPTIONAL and resolving hidden (`_`-prefixed) symbols. A
2206 // concrete (non-hidden) symbol terminates the descent: it is a
2207 // document node whose own body we then inspect for free text.
2208 fn collect_content_kinds(
2209 grammar: &Grammar,
2210 prod: &Production,
2211 out: &mut std::collections::HashSet<String>,
2212 seen: &mut std::collections::HashSet<String>,
2213 ) {
2214 match prod {
2215 Production::Seq { members } | Production::Choice { members } => {
2216 for m in members {
2217 collect_content_kinds(grammar, m, out, seen);
2218 }
2219 }
2220 Production::Repeat { content }
2221 | Production::Repeat1 { content }
2222 | Production::Optional { content }
2223 | Production::Token { content }
2224 | Production::ImmediateToken { content }
2225 | Production::Prec { content, .. }
2226 | Production::PrecLeft { content, .. }
2227 | Production::PrecRight { content, .. }
2228 | Production::PrecDynamic { content, .. }
2229 | Production::Field { content, .. }
2230 | Production::Reserved { content, .. } => {
2231 collect_content_kinds(grammar, content, out, seen);
2232 }
2233 Production::Symbol { name } => {
2234 if name.starts_with('_') {
2235 // Hidden rule: descend through it (inlined by tree-sitter).
2236 if seen.insert(name.clone()) {
2237 if let Some(r) = grammar.rules.get(name) {
2238 collect_content_kinds(grammar, r, out, seen);
2239 }
2240 }
2241 } else {
2242 // Concrete document node: record, do not descend.
2243 out.insert(name.clone());
2244 }
2245 }
2246 _ => {}
2247 }
2248 }
2249
2250 let mut kinds = std::collections::HashSet::new();
2251 collect_content_kinds(
2252 grammar,
2253 start_body,
2254 &mut kinds,
2255 &mut std::collections::HashSet::new(),
2256 );
2257
2258 // A content kind admits a trailing newline when its body is (or is a
2259 // REPEAT1 of) a free-text PATTERN matching a bare newline.
2260 fn body_admits_newline_text(prod: &Production) -> bool {
2261 match prod {
2262 Production::Pattern { value } => pattern_admits_newline(value),
2263 Production::Repeat1 { content } | Production::Repeat { content } => {
2264 body_admits_newline_text(content)
2265 }
2266 Production::Choice { members } => members.iter().any(body_admits_newline_text),
2267 Production::Token { content } | Production::ImmediateToken { content } => {
2268 body_admits_newline_text(content)
2269 }
2270 _ => false,
2271 }
2272 }
2273
2274 kinds
2275 .iter()
2276 .any(|k| grammar.rules.get(k).is_some_and(body_admits_newline_text))
2277}
2278
2279// ═══════════════════════════════════════════════════════════════════
2280// Format policy
2281/// Classify external scanner tokens as layout actions based on
2282/// tree-sitter naming conventions. These conventions are ecosystem-
2283/// wide, not language-specific: every indent-based grammar uses
2284/// `_indent`/`_dedent`, every ASI grammar uses `_automatic_semicolon`.
2285pub(crate) fn classify_external_layout_tokens(grammar: &mut Grammar) {
2286 // External tokens have no grammar rule. We identify them by
2287 // checking which hidden symbols are NOT in grammar.rules.
2288 // Then classify by naming convention.
2289 //
2290 // This runs after external_alias_map is built, so tokens with
2291 // known text are already handled. Layout tokens are the remainder.
2292 let all_hidden_refs = collect_all_symbol_refs(&grammar.rules);
2293 for name in &all_hidden_refs {
2294 if !name.starts_with('_') || grammar.rules.contains_key(name) {
2295 continue;
2296 }
2297 if grammar.external_alias_map.contains_key(name) {
2298 continue;
2299 }
2300 if name == "_indent" || name.ends_with("_indent") {
2301 grammar.external_indent_opens.insert(name.clone());
2302 } else if name == "_dedent" || name.ends_with("_dedent") {
2303 grammar.external_indent_closes.insert(name.clone());
2304 } else if name.contains("line_ending")
2305 || name.contains("newline")
2306 || name.ends_with("_or_eof")
2307 // `_automatic_separator` is the tree-sitter ASI convention for a
2308 // scanner-inserted statement terminator that is a NEWLINE (V's
2309 // statement separator between `*ap = size` and the prior stmt).
2310 // Unclassified, it emitted nothing and merged adjacent statements
2311 // onto one line, so `& u64(a)` `*ap` re-lexed as a multiplication.
2312 || name.contains("automatic_separator")
2313 // The tree-sitter ASI / layout-rule families surface a statement
2314 // terminator that the scanner inserts where the source wrote a
2315 // NEWLINE (or end-of-construct), never a literal `;` -- the written
2316 // `;` is always a separate STRING token (`_semicolon =
2317 // CHOICE[_automatic_semicolon, ";"]` in JS/perl, or no `;` at all
2318 // in kotlin where `_semi = _automatic_semicolon`). Emitting these
2319 // as a literal `;` corrupts content: kotlin `enum {...}` (the
2320 // mandatory `source_file` `_semi` after the class) accreted a
2321 // spurious trailing `;`. They are newlines.
2322 || name.contains("automatic_semicolon")
2323 || name.contains("layout_semicolon")
2324 {
2325 grammar.external_newlines.insert(name.clone());
2326 } else if name.contains("semicolon") {
2327 grammar.external_semicolons.insert(name.clone());
2328 }
2329 }
2330}
2331
2332/// Identify external scanner tokens that bracket content, derived from
2333/// grammar structure: a rule whose (unwrapped) body is a SEQ whose first
2334/// and last members are external SYMBOLs (no grammar rule of their own)
2335/// with content in between is a delimiter pair around that content. The
2336/// canonical case is `string = SEQ[string_start, REPEAT(content),
2337/// string_end]`: the delimiters must hug the content (`'hello'`), and
2338/// the grammar states which externals they are without naming
2339/// conventions.
2340pub(crate) fn classify_external_bracket_delimiters(grammar: &mut Grammar) {
2341 let is_external = |name: &str| !grammar.rules.contains_key(name);
2342 let mut opens = std::collections::HashSet::new();
2343 let mut closes = std::collections::HashSet::new();
2344 let mut content_kinds = std::collections::HashSet::new();
2345 for rule in grammar.rules.values() {
2346 let Production::Seq { members } = unwrap_to_seq(rule) else {
2347 continue;
2348 };
2349 if members.len() < 3 {
2350 continue;
2351 }
2352 let (Some(first), Some(last)) = (members.first(), members.last()) else {
2353 continue;
2354 };
2355 // The delimiter may be a bare external SYMBOL or an anonymous ALIAS
2356 // renaming one (ruby `ALIAS{_string_start, value:"\""}`); unwrap to
2357 // the underlying external name in both cases.
2358 let (Some(open), Some(close)) = (
2359 delimiter_external_name(first),
2360 delimiter_external_name(last),
2361 ) else {
2362 continue;
2363 };
2364 if open == close || !is_external(open) || !is_external(close) {
2365 continue;
2366 }
2367 // Content between the delimiters (a REPEAT of string content, an
2368 // interpolation choice, …) — at least one non-delimiter member.
2369 opens.insert(open.to_owned());
2370 closes.insert(close.to_owned());
2371 // The visible (non-`_`) external symbols reachable from the members
2372 // *between* the delimiters are the captured-content tokens (ruby
2373 // `string_content`/`heredoc_content`, regex content). They carry the
2374 // literal source bytes and must emit tight; collect them, expanding
2375 // hidden-rule references (ruby wraps the content in the hidden
2376 // `_literal_contents = REPEAT1(CHOICE[string_content, …])`).
2377 for member in &members[1..members.len() - 1] {
2378 collect_visible_external_content(grammar, member, &mut content_kinds, &mut Vec::new());
2379 }
2380 }
2381 grammar.external_bracket_opens = opens;
2382 grammar.external_bracket_closes = closes;
2383 grammar.external_content_kinds = content_kinds;
2384}
2385
2386/// Derive the emitted text for an external *closing* delimiter whose
2387/// matching *opener* is a literal `STRING`. A rule shaped `SEQ[STRING q,
2388/// body.., EXTERNAL close]` opens with a grammar literal but closes with a
2389/// scanner external that has no rule and no otherwise-resolvable text
2390/// (TOML's `_multiline_basic_string` / `_multiline_literal_string`). Such a
2391/// string closes with the same delimiter it opens with, so the external
2392/// close emits the opener's literal. Matches only the asymmetric
2393/// STRING-open / external-close shape (the all-external and all-STRING
2394/// shapes are handled by `classify_external_bracket_delimiters` /
2395/// `classify_string_content_kinds`); the close must be an external with no
2396/// rule, so ordinary bracketed constructs do not match.
2397pub(crate) fn classify_external_close_text(grammar: &mut Grammar) {
2398 let is_external = |name: &str| !grammar.rules.contains_key(name);
2399 let mut close_text = std::collections::HashMap::new();
2400 for rule in grammar.rules.values() {
2401 let Production::Seq { members } = unwrap_to_seq(rule) else {
2402 continue;
2403 };
2404 if members.len() < 3 {
2405 continue;
2406 }
2407 let (Some(first), Some(last)) = (members.first(), members.last()) else {
2408 continue;
2409 };
2410 // Opener: a literal STRING (possibly wrapped in a token/precedence
2411 // node) that is a *quote delimiter* — its first character is a
2412 // string quote (`"`, `'`, or a backtick). A keyword opener (FIRRTL
2413 // `memory = SEQ["mem", .., _dedent]`) is NOT a string delimiter, so
2414 // its trailing external is a layout token, not a closing quote.
2415 let Some(open) = string_literal_value(first) else {
2416 continue;
2417 };
2418 if !open.starts_with(['"', '\'', '`']) {
2419 continue;
2420 }
2421 // Closer: a bare external SYMBOL with no rule that is NOT a layout
2422 // token (a `_dedent` / `_newline` / `_indent` closing an indent
2423 // block or terminating a line is structural layout, not a string
2424 // terminator — emitting the open quote in its place corrupts the
2425 // block structure).
2426 let Some(close) = delimiter_external_name(last) else {
2427 continue;
2428 };
2429 if !is_external(close)
2430 || grammar.external_indent_closes.contains(close)
2431 || grammar.external_indent_opens.contains(close)
2432 || grammar.external_newlines.contains(close)
2433 || close.contains("indent")
2434 || close.contains("dedent")
2435 || close.contains("newline")
2436 || close.contains("line_ending")
2437 || close.ends_with("_or_eof")
2438 {
2439 continue;
2440 }
2441 close_text.insert(close.to_owned(), open.to_owned());
2442 }
2443 grammar.external_close_text = close_text;
2444}
2445
2446/// The literal `STRING` value a delimiter member resolves to, unwrapping
2447/// token / immediate-token / precedence wrappers. `None` for any other
2448/// shape.
2449fn string_literal_value(prod: &Production) -> Option<&str> {
2450 match prod {
2451 Production::String { value } => Some(value.as_str()),
2452 Production::Token { content }
2453 | Production::ImmediateToken { content }
2454 | Production::Prec { content, .. }
2455 | Production::PrecLeft { content, .. }
2456 | Production::PrecRight { content, .. }
2457 | Production::PrecDynamic { content, .. }
2458 | Production::Reserved { content, .. } => string_literal_value(content),
2459 _ => None,
2460 }
2461}
2462
2463/// The external scanner-token name a delimiter member resolves to: either
2464/// a bare `SYMBOL` or an anonymous `ALIAS` renaming one (`ALIAS{
2465/// _string_start, value: "\""}`). `None` for any other shape.
2466fn delimiter_external_name(prod: &Production) -> Option<&str> {
2467 match prod {
2468 Production::Symbol { name } => Some(name.as_str()),
2469 Production::Alias {
2470 content,
2471 named: false,
2472 ..
2473 } => external_symbol_name(content),
2474 _ => None,
2475 }
2476}
2477
2478/// Collect the visible (non-`_`-prefixed) external scanner tokens reachable
2479/// from a string-body production, expanding hidden-rule references (the
2480/// content is often wrapped in a hidden `_literal_contents` rule). A visible
2481/// external has no rule and no leading underscore. Cycle-guarded on the
2482/// hidden rules visited.
2483fn collect_visible_external_content<'g>(
2484 grammar: &'g Grammar,
2485 prod: &'g Production,
2486 out: &mut std::collections::HashSet<String>,
2487 visiting: &mut Vec<&'g str>,
2488) {
2489 match prod {
2490 Production::Symbol { name } => {
2491 if !grammar.rules.contains_key(name) {
2492 if !name.starts_with('_') {
2493 out.insert(name.clone());
2494 }
2495 } else if name.starts_with('_') && !visiting.contains(&name.as_str()) {
2496 // Expand a hidden rule reference to reach its content tokens.
2497 visiting.push(name.as_str());
2498 if let Some(rule) = grammar.rules.get(name) {
2499 collect_visible_external_content(grammar, rule, out, visiting);
2500 }
2501 visiting.pop();
2502 }
2503 }
2504 Production::Choice { members } | Production::Seq { members } => {
2505 for m in members {
2506 collect_visible_external_content(grammar, m, out, visiting);
2507 }
2508 }
2509 Production::Repeat { content }
2510 | Production::Repeat1 { content }
2511 | Production::Optional { content }
2512 | Production::Token { content }
2513 | Production::ImmediateToken { content }
2514 | Production::Prec { content, .. }
2515 | Production::PrecLeft { content, .. }
2516 | Production::PrecRight { content, .. }
2517 | Production::PrecDynamic { content, .. }
2518 | Production::Reserved { content, .. }
2519 | Production::Field { content, .. }
2520 | Production::Alias { content, .. } => {
2521 collect_visible_external_content(grammar, content, out, visiting);
2522 }
2523 Production::String { .. } | Production::Pattern { .. } | Production::Blank => {}
2524 }
2525}
2526
2527/// Classify the *content kinds* of literal-quote-delimited string rules:
2528/// rules shaped `SEQ[STRING q, body.., STRING q]` where the same quote
2529/// literal opens and closes (CSS `string_value`, C# / Java
2530/// `string_literal`). The body is a `REPEAT`/`REPEAT1` over a `CHOICE` of
2531/// content pieces (`string_content` aliased over a PATTERN,
2532/// `escape_sequence`); each piece carries verbatim source bytes and must
2533/// emit *tight* on both sides so the layout pass does not accrete a space
2534/// between adjacent content / escape leaves (`"ab\t"`, not `"ab \t "`).
2535///
2536/// This is the literal-delimiter twin of
2537/// [`classify_external_bracket_delimiters`], which only matches *external*
2538/// (scanner) delimiters and so skips these STRING-quoted rules. Both feed
2539/// the same tight-content emission path. The match is anchored on a
2540/// *matched quote pair* and a `REPEAT` body, so it does not fire for
2541/// ordinary bracketed constructs (`( … )`, `{ … }`) whose opener and
2542/// closer differ, nor for indent blocks (no STRING delimiter).
2543/// The `SEQ` alternatives of a rule: a single `SEQ` yields itself; a
2544/// top-level `CHOICE` yields each alternative (unwrapped through
2545/// precedence / token wrappers). The CSS `string_value` is a
2546/// `CHOICE[SEQ['…'], SEQ["…"]]` (one alternative per quote style), so a
2547/// string classifier must look at each branch.
2548fn seq_alternatives(rule: &Production) -> Vec<&Production> {
2549 match unwrap_to_seq(rule) {
2550 Production::Choice { members } => members.iter().map(unwrap_to_seq).collect(),
2551 other => vec![other],
2552 }
2553}
2554
2555/// True when a string-body member is *unbounded*: it is directly a
2556/// `REPEAT`/`REPEAT1`, or it references a named symbol whose own rule is an
2557/// unbounded content rule (reached through `CHOICE`/`BLANK`/`OPTIONAL`/`FIELD`
2558/// wrappers). This lets a `string` whose body delegates the open-ended content
2559/// to a separate rule (query's `string_content = REPEAT1(...)`) still register
2560/// as a quoted-string shape. One hop only, to keep the classifier cheap and
2561/// avoid following arbitrary rule graphs.
2562fn member_is_unbounded_body(
2563 prod: &Production,
2564 rules: &std::collections::BTreeMap<String, Production>,
2565) -> bool {
2566 match prod {
2567 Production::Repeat { .. } | Production::Repeat1 { .. } => true,
2568 Production::Symbol { name } => rules
2569 .get(name)
2570 .is_some_and(|r| matches!(r, Production::Repeat { .. } | Production::Repeat1 { .. })),
2571 Production::Choice { members } | Production::Seq { members } => {
2572 members.iter().any(|m| member_is_unbounded_body(m, rules))
2573 }
2574 Production::Optional { content } | Production::Field { content, .. } => {
2575 member_is_unbounded_body(content, rules)
2576 }
2577 _ => false,
2578 }
2579}
2580
2581pub(crate) fn classify_string_content_kinds(grammar: &mut Grammar) {
2582 let mut accum = StringContentAccum::new();
2583 for rule in grammar.rules.values() {
2584 for seq in seq_alternatives(rule) {
2585 let Production::Seq { members } = seq else {
2586 continue;
2587 };
2588 if members.len() < 3 {
2589 continue;
2590 }
2591 // The opener is the first member and must be a literal quote
2592 // STRING (`'…'`, `"…"`).
2593 let Some(first @ Production::String { value: open }) = members.first() else {
2594 continue;
2595 };
2596 // Only a genuine *quote* delimiter (`'`, `"`, `` ` ``) opens a
2597 // string body. A bracket-like paired STRING (`|…|` block params,
2598 // `(…)`) is NOT a string: its inner symbols (`identifier`) are
2599 // structural children, not verbatim content, and must not be
2600 // emitted tight. (The old direct-`REPEAT` body guard happened to
2601 // exclude `|…|`; once the unbounded-body check follows a symbol
2602 // hop, the quote guard is what keeps the classifier sound.)
2603 if !is_quote_delimiter(first) {
2604 continue;
2605 }
2606 // The closer is the *last member whose (possibly wrapped) STRING
2607 // equals the opener*; a trailing suffix may follow (C#'s
2608 // `string_literal` ends with an optional `(u|U)8` encoding CHOICE
2609 // after the close quote). The close may be wrapped in an
2610 // `IMMEDIATE_TOKEN` (the scanner-tight close common to many string
2611 // rules, e.g. query `SEQ["\"", body, IMMEDIATE_TOKEN("\"")]`), so
2612 // unwrap before comparing.
2613 let Some(close_idx) = members
2614 .iter()
2615 .rposition(|m| unwrap_to_string(m) == Some(open.as_str()))
2616 else {
2617 continue;
2618 };
2619 if close_idx == 0 {
2620 continue;
2621 }
2622 // The body must be unbounded (the open-ended string body),
2623 // distinguishing a quoted string from a fixed two-quote token. A
2624 // body member is unbounded when it is itself a REPEAT/REPEAT1 or
2625 // when it references a named symbol whose own rule is an unbounded
2626 // content rule (query's `string` body is `CHOICE[string_content |
2627 // BLANK]` where `string_content = REPEAT1(...)`). Collect the named
2628 // content kinds it can yield, then commit only if the body confirmed
2629 // the string shape.
2630 let mut has_repeat_body = false;
2631 for member in &members[1..close_idx] {
2632 if member_is_unbounded_body(member, &grammar.rules) {
2633 has_repeat_body = true;
2634 }
2635 collect_string_content_kinds(member, &mut accum);
2636 }
2637 if has_repeat_body {
2638 accum.commit();
2639 } else {
2640 accum.clear_in_rule_guard();
2641 }
2642 }
2643 }
2644 grammar.string_content_kinds = accum.into_set();
2645}
2646
2647/// Accumulator that only retains content kinds collected from a body that
2648/// actually had a `REPEAT` (an unbounded string body). A small helper so
2649/// `classify_string_content_kinds` can discard a false match without
2650/// losing kinds collected from other rules.
2651struct StringContentAccum {
2652 confirmed: std::collections::HashSet<String>,
2653 pending: std::collections::HashSet<String>,
2654}
2655
2656impl StringContentAccum {
2657 fn new() -> Self {
2658 Self {
2659 confirmed: std::collections::HashSet::new(),
2660 pending: std::collections::HashSet::new(),
2661 }
2662 }
2663 fn insert(&mut self, kind: String) {
2664 self.pending.insert(kind);
2665 }
2666 fn commit(&mut self) {
2667 for k in self.pending.drain() {
2668 self.confirmed.insert(k);
2669 }
2670 }
2671 fn clear_in_rule_guard(&mut self) {
2672 self.pending.clear();
2673 }
2674 fn into_set(mut self) -> std::collections::HashSet<String> {
2675 self.commit();
2676 self.confirmed
2677 }
2678}
2679
2680/// Recursively collect the named content kinds reachable inside a string
2681/// body: named `ALIAS` values (`string_content` over a PATTERN) and named
2682/// `SYMBOL` references (`escape_sequence`). Hidden (`_`-prefixed) symbols
2683/// are not vertices, so they are skipped; concrete sub-rules are not
2684/// recursed into (they are their own vertices with their own layout).
2685fn collect_string_content_kinds(prod: &Production, out: &mut StringContentAccum) {
2686 match prod {
2687 Production::Alias {
2688 value, named: true, ..
2689 } => out.insert(value.clone()),
2690 Production::Symbol { name } if !name.starts_with('_') => out.insert(name.clone()),
2691 Production::Choice { members } | Production::Seq { members } => {
2692 for m in members {
2693 collect_string_content_kinds(m, out);
2694 }
2695 }
2696 Production::Repeat { content }
2697 | Production::Repeat1 { content }
2698 | Production::Optional { content }
2699 | Production::Token { content }
2700 | Production::ImmediateToken { content }
2701 | Production::Prec { content, .. }
2702 | Production::PrecLeft { content, .. }
2703 | Production::PrecRight { content, .. }
2704 | Production::PrecDynamic { content, .. }
2705 | Production::Reserved { content, .. }
2706 | Production::Field { content, .. } => collect_string_content_kinds(content, out),
2707 _ => {}
2708 }
2709}
2710
2711/// Identify indented-block rules whose opening `_indent` is supplied by
2712/// a hidden parent: the rule's body references an external indent-close
2713/// token (`_dedent`) but no indent-open token. Python's `block = SEQ[
2714/// REPEAT(_statement), _dedent]` is the canonical case — the matching
2715/// `_indent` sits in the hidden `_suite` wrapper, which is not a vertex,
2716/// so the parser hands the emitter a bare `block` and the opening indent
2717/// must be synthesized.
2718pub(crate) fn classify_synthetic_indent_rules(grammar: &mut Grammar) {
2719 if grammar.external_indent_closes.is_empty() {
2720 return;
2721 }
2722 let mut rules = std::collections::HashSet::new();
2723 for (name, rule) in &grammar.rules {
2724 let symbols = referenced_symbols(rule);
2725 let references_close = symbols
2726 .iter()
2727 .any(|s| grammar.external_indent_closes.contains(*s));
2728 let references_open = symbols
2729 .iter()
2730 .any(|s| grammar.external_indent_opens.contains(*s));
2731 if references_close && !references_open {
2732 rules.insert(name.clone());
2733 }
2734 }
2735 grammar.synthetic_indent_rules = rules;
2736}
2737
2738/// Collect named terminal kinds whose underlying `PATTERN` can match a
2739/// leading space (see `pattern_absorbs_leading_space`). Two shapes
2740/// produce such a kind on the schema:
2741///
2742/// - `ALIAS { content: PATTERN p, named: true, value: K }` — the pattern is
2743/// renamed to kind `K` (INI's `setting_value`, `setting_name`, …).
2744/// - a named rule `K = PATTERN p` — the rule itself is a bare terminal.
2745///
2746/// In both cases the captured text round-trips through `K`, so a layout
2747/// space emitted before it would be absorbed and grow. The pattern is read
2748/// through token/precedence wrappers via [`terminal_pattern_of`].
2749pub(crate) fn classify_leading_space_terminals(
2750 grammar: &Grammar,
2751) -> std::collections::HashSet<String> {
2752 let mut out = std::collections::HashSet::new();
2753
2754 // Named rules that are themselves a bare terminal pattern.
2755 for (name, rule) in &grammar.rules {
2756 if let Some(p) = terminal_pattern_of(rule) {
2757 if pattern_absorbs_leading_space(p) {
2758 out.insert(name.clone());
2759 }
2760 }
2761 }
2762
2763 // Named aliases wrapping a terminal pattern.
2764 fn walk(prod: &Production, out: &mut std::collections::HashSet<String>) {
2765 match prod {
2766 Production::Alias {
2767 content,
2768 named: true,
2769 value,
2770 } => {
2771 if let Some(p) = terminal_pattern_of(content) {
2772 if pattern_absorbs_leading_space(p) {
2773 out.insert(value.clone());
2774 }
2775 }
2776 walk(content, out);
2777 }
2778 Production::Alias { content, .. }
2779 | Production::Repeat { content }
2780 | Production::Repeat1 { content }
2781 | Production::Optional { content }
2782 | Production::Field { content, .. }
2783 | Production::Token { content }
2784 | Production::ImmediateToken { content }
2785 | Production::Prec { content, .. }
2786 | Production::PrecLeft { content, .. }
2787 | Production::PrecRight { content, .. }
2788 | Production::PrecDynamic { content, .. }
2789 | Production::Reserved { content, .. } => walk(content, out),
2790 Production::Seq { members } | Production::Choice { members } => {
2791 for m in members {
2792 walk(m, out);
2793 }
2794 }
2795 _ => {}
2796 }
2797 }
2798 for rule in grammar.rules.values() {
2799 walk(rule, &mut out);
2800 }
2801 out
2802}
2803
2804/// Classify named alias values whose ALIAS content is an `IMMEDIATE_TOKEN`
2805/// wrapping a bare `PATTERN` content fragment AND which appears as the
2806/// quote-pair-delimited *body* of a string/character literal (C
2807/// `char_literal`'s `character` = `[^\n']`, `string_literal`'s
2808/// `string_content` = `[^\\"\n]+`). Such a token is lexed only with no
2809/// preceding whitespace; since the alias value has no grammar rule, the
2810/// rule-head `IMMEDIATE_TOKEN` no-space check never fires, so the emitter
2811/// records these kinds to emit them tight (`'hey'` stays tight rather than
2812/// re-spacing to `' h e y'`, whose spaces re-parse as extra `character`s).
2813///
2814/// Two structural conditions, BOTH required:
2815///
2816/// 1. The immediate-token content reduces to a bare `PATTERN` (a character-
2817/// class content fragment), NOT a `STRING` / `CHOICE`-of-strings. Julia
2818/// spells a context-sensitive keyword-as-identifier as
2819/// `ALIAS{IMMEDIATE_TOKEN CHOICE[STRING "module", …], value:"identifier"}`,
2820/// a word-like token that must keep its surrounding whitespace
2821/// (`function foo`, not `functionfoo`). `terminal_pattern_of` returns
2822/// `Some` only for the bare-PATTERN shape, so it draws exactly this line.
2823///
2824/// 2. The alias is a member (through repeat/optional/choice wrappers) of a
2825/// `SEQ` whose first and last members are *quote delimiters* (a `STRING`
2826/// or `CHOICE`-of-`STRING`s ending in `'`/`"`/`` ` ``): i.e. it is the
2827/// delimited body of a quote pair. This excludes content fragments that
2828/// are numeric brace-range / bracket bodies whose `IMMEDIATE_TOKEN` is a
2829/// scanner immediacy fact LOCAL to that one construct, not a property of
2830/// the kind everywhere: bash `brace_expression = SEQ["{", number, "..",
2831/// number, "}"]` aliases `\d+` to `number`, but the very same `number`
2832/// kind is also a freestanding command argument (`exit 1`, `echo 1`)
2833/// where it must keep its leading space. The `{`/`}` brackets are not
2834/// quote delimiters, so `number` is not collected, and the argument
2835/// `number` is left spaced. Without this guard, db9b280 tightened every
2836/// `number` and regressed bash (`exit 1` → `exit1`).
2837pub(crate) fn classify_immediate_token_alias_kinds(
2838 grammar: &Grammar,
2839) -> std::collections::HashSet<String> {
2840 let mut out = std::collections::HashSet::new();
2841 // Collect every immediate-token bare-PATTERN alias reachable inside a
2842 // production (used for the interior body of a detected quote pair).
2843 fn collect_aliases(prod: &Production, out: &mut std::collections::HashSet<String>) {
2844 match prod {
2845 Production::Alias {
2846 content,
2847 named: true,
2848 value,
2849 } => {
2850 if is_immediate_token(content) && terminal_pattern_of(content).is_some() {
2851 out.insert(value.clone());
2852 }
2853 collect_aliases(content, out);
2854 }
2855 Production::Alias { content, .. }
2856 | Production::Repeat { content }
2857 | Production::Repeat1 { content }
2858 | Production::Optional { content }
2859 | Production::Field { content, .. }
2860 | Production::Token { content }
2861 | Production::ImmediateToken { content }
2862 | Production::Prec { content, .. }
2863 | Production::PrecLeft { content, .. }
2864 | Production::PrecRight { content, .. }
2865 | Production::PrecDynamic { content, .. }
2866 | Production::Reserved { content, .. } => collect_aliases(content, out),
2867 Production::Seq { members } | Production::Choice { members } => {
2868 for m in members {
2869 collect_aliases(m, out);
2870 }
2871 }
2872 _ => {}
2873 }
2874 }
2875 // Walk every production; whenever a `SEQ` is bracketed by quote
2876 // delimiters (a string/char literal), harvest the immediate-token
2877 // aliases from its interior body.
2878 fn walk(prod: &Production, out: &mut std::collections::HashSet<String>) {
2879 match prod {
2880 Production::Seq { members } => {
2881 if members.len() >= 2
2882 && is_quote_delimiter(&members[0])
2883 && is_quote_delimiter(&members[members.len() - 1])
2884 {
2885 for m in &members[1..members.len() - 1] {
2886 collect_aliases(m, out);
2887 }
2888 }
2889 for m in members {
2890 walk(m, out);
2891 }
2892 }
2893 Production::Choice { members } => {
2894 for m in members {
2895 walk(m, out);
2896 }
2897 }
2898 Production::Alias { content, .. }
2899 | Production::Repeat { content }
2900 | Production::Repeat1 { content }
2901 | Production::Optional { content }
2902 | Production::Field { content, .. }
2903 | Production::Token { content }
2904 | Production::ImmediateToken { content }
2905 | Production::Prec { content, .. }
2906 | Production::PrecLeft { content, .. }
2907 | Production::PrecRight { content, .. }
2908 | Production::PrecDynamic { content, .. }
2909 | Production::Reserved { content, .. } => walk(content, out),
2910 _ => {}
2911 }
2912 }
2913 for rule in grammar.rules.values() {
2914 walk(rule, &mut out);
2915 }
2916 out
2917}
2918
2919/// Classify named terminal kinds whose underlying `PATTERN` runs to the end
2920/// of the line (`hash_bang_line = #!.*`, `shebang = #!...`). These are
2921/// rest-of-line terminals: the layout pass emits a newline after them so the
2922/// next sibling does not merge into the token on re-parse. The generalised
2923/// counterpart of `line_comment_prefixes` (a line comment is a STRING prefix
2924/// then a rest-of-line PATTERN; here the whole token is the pattern).
2925pub(crate) fn classify_line_rest_kinds(grammar: &Grammar) -> std::collections::HashSet<String> {
2926 let mut out = std::collections::HashSet::new();
2927
2928 // Named rules that are themselves a bare rest-of-line pattern.
2929 for (name, rule) in &grammar.rules {
2930 if let Some(p) = terminal_pattern_of(rule) {
2931 if is_rest_of_line_pattern(p) {
2932 out.insert(name.clone());
2933 }
2934 }
2935 }
2936
2937 // Named aliases wrapping a rest-of-line pattern.
2938 fn walk(prod: &Production, out: &mut std::collections::HashSet<String>) {
2939 match prod {
2940 Production::Alias {
2941 content,
2942 named: true,
2943 value,
2944 } => {
2945 if let Some(p) = terminal_pattern_of(content) {
2946 if is_rest_of_line_pattern(p) {
2947 out.insert(value.clone());
2948 }
2949 }
2950 walk(content, out);
2951 }
2952 Production::Alias { content, .. }
2953 | Production::Repeat { content }
2954 | Production::Repeat1 { content }
2955 | Production::Optional { content }
2956 | Production::Field { content, .. }
2957 | Production::Token { content }
2958 | Production::ImmediateToken { content }
2959 | Production::Prec { content, .. }
2960 | Production::PrecLeft { content, .. }
2961 | Production::PrecRight { content, .. }
2962 | Production::PrecDynamic { content, .. }
2963 | Production::Reserved { content, .. } => walk(content, out),
2964 Production::Seq { members } | Production::Choice { members } => {
2965 for m in members {
2966 walk(m, out);
2967 }
2968 }
2969 _ => {}
2970 }
2971 }
2972 for rule in grammar.rules.values() {
2973 walk(rule, &mut out);
2974 }
2975 out
2976}