Skip to main content

panproto_parse/
emit_pretty.rs

1#![allow(
2    clippy::module_name_repetitions,
3    clippy::too_many_lines,
4    clippy::too_many_arguments,
5    clippy::map_unwrap_or,
6    clippy::option_if_let_else,
7    clippy::elidable_lifetime_names,
8    clippy::items_after_statements,
9    clippy::needless_pass_by_value,
10    clippy::single_match_else,
11    clippy::manual_let_else,
12    clippy::match_same_arms,
13    clippy::missing_const_for_fn,
14    clippy::single_char_pattern,
15    clippy::naive_bytecount,
16    clippy::expect_used,
17    clippy::redundant_pub_crate,
18    clippy::used_underscore_binding,
19    clippy::redundant_field_names,
20    clippy::struct_field_names,
21    clippy::redundant_else,
22    clippy::similar_names
23)]
24
25//! De-novo source emission from a by-construction schema.
26//!
27//! [`AstParser::emit`] reconstructs source from byte-position fragments
28//! that the parser stored on the schema during `parse`. That works for
29//! edit pipelines (`parse → transform → emit`) but fails for schemas
30//! built by hand (`SchemaBuilder` with no parse history): they carry
31//! no `start-byte`, no `interstitial-N`, no `literal-value`, and the
32//! reconstructor returns `Err(EmitFailed { reason: "schema has no
33//! text fragments" })`.
34//!
35//! This module renders such schemas to source bytes by walking
36//! tree-sitter's `grammar.json` production rules. For each schema
37//! vertex of kind `K`, the walker looks up `K`'s production in the
38//! grammar and emits its body in order:
39//!
40//! - `STRING` nodes contribute literal token bytes directly.
41//! - `SYMBOL` and `FIELD` nodes recurse into the schema's children,
42//!   matching by edge kind (which is the tree-sitter field name).
43//! - `SEQ` emits its members in order.
44//! - `CHOICE` picks the alternative whose head `SYMBOL` matches an
45//!   actual child kind, or whose terminals appear in the rendered
46//!   prefix; falls back to the first non-`BLANK` alternative when no
47//!   alternative matches.
48//! - `REPEAT` and `REPEAT1` emit their content once per matching
49//!   child edge in declared order.
50//! - `OPTIONAL` emits its content iff a corresponding child edge or
51//!   constraint is populated.
52//! - `PATTERN` is a regex placeholder for variable-text terminals
53//!   (identifiers, numbers, quoted strings). The walker emits a
54//!   `literal-value` constraint when present and otherwise falls
55//!   back to a placeholder derived from the regex shape.
56//! - `BLANK`, `TOKEN`, `IMMEDIATE_TOKEN`, `ALIAS`, `PREC*` are
57//!   handled transparently (the inner content is emitted; the
58//!   wrapper is dropped).
59//!
60//! Whitespace and indentation come from a `FormatPolicy` applied
61//! during emission. The default policy inserts a single space between
62//! adjacent tokens, a newline after `;` / `}` / `{`, and tracks an
63//! indent counter on `{` / `}` boundaries.
64//!
65//! Output is *syntactically valid* for any grammar that ships
66//! `grammar.json`. Idiomatic formatting (rustfmt-style spacing rules,
67//! per-language conventions) is a polish layer that lives outside
68//! this module.
69
70use std::collections::BTreeMap;
71
72use panproto_schema::{Edge, Schema};
73use serde::Deserialize;
74
75use crate::error::ParseError;
76
77// ═══════════════════════════════════════════════════════════════════
78// Grammar JSON model
79// ═══════════════════════════════════════════════════════════════════
80
81/// A single tree-sitter production rule.
82///
83/// Mirrors the shape emitted by `tree-sitter generate`: every node has
84/// a `type` discriminator that selects a structural variant. The
85/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
86/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
87/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
88/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
89#[derive(Debug, Clone, Deserialize)]
90#[serde(tag = "type")]
91#[non_exhaustive]
92pub enum Production {
93    /// Concatenation of productions.
94    #[serde(rename = "SEQ")]
95    Seq {
96        /// Ordered members; each is emitted in turn.
97        members: Vec<Self>,
98    },
99    /// Alternation between productions.
100    #[serde(rename = "CHOICE")]
101    Choice {
102        /// Alternatives; the walker picks one based on the schema's
103        /// children and constraints.
104        members: Vec<Self>,
105    },
106    /// Zero-or-more repetition.
107    #[serde(rename = "REPEAT")]
108    Repeat {
109        /// The repeated body.
110        content: Box<Self>,
111    },
112    /// One-or-more repetition.
113    #[serde(rename = "REPEAT1")]
114    Repeat1 {
115        /// The repeated body.
116        content: Box<Self>,
117    },
118    /// Optional inclusion (zero or one).
119    ///
120    /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
121    /// BLANK }`, but recent generator versions also emit explicit
122    /// `OPTIONAL` nodes; both shapes are accepted.
123    #[serde(rename = "OPTIONAL")]
124    Optional {
125        /// The optional body.
126        content: Box<Self>,
127    },
128    /// Reference to another rule by name.
129    #[serde(rename = "SYMBOL")]
130    Symbol {
131        /// Name of the referenced rule (matches a vertex kind on the
132        /// schema side).
133        name: String,
134    },
135    /// Literal token bytes.
136    #[serde(rename = "STRING")]
137    String {
138        /// The literal token. Emitted verbatim.
139        value: String,
140    },
141    /// Regex-matched terminal.
142    ///
143    /// At parse time this matches arbitrary bytes; at emit time the
144    /// walker substitutes a `literal-value` constraint when present
145    /// and falls back to a placeholder otherwise.
146    #[serde(rename = "PATTERN")]
147    Pattern {
148        /// The original regex.
149        value: String,
150    },
151    /// The empty production. Emits nothing.
152    #[serde(rename = "BLANK")]
153    Blank,
154    /// Named field over a content production.
155    ///
156    /// The field `name` matches an edge kind on the schema side; the
157    /// walker resolves the corresponding child vertex and recurses
158    /// into `content` with that child as context.
159    #[serde(rename = "FIELD")]
160    Field {
161        /// Field name (matches edge kind).
162        name: String,
163        /// The contents of the field.
164        content: Box<Self>,
165    },
166    /// An aliased production.
167    ///
168    /// `value` records the parser-visible kind; the walker emits
169    /// `content` and ignores the alias rename.
170    #[serde(rename = "ALIAS")]
171    Alias {
172        /// The aliased content.
173        content: Box<Self>,
174        /// Whether the alias is a named node.
175        #[serde(default)]
176        named: bool,
177        /// The alias's surface name.
178        #[serde(default)]
179        value: String,
180    },
181    /// A token wrapper.
182    ///
183    /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
184    /// lexical token; the walker emits the inner content unchanged.
185    #[serde(rename = "TOKEN")]
186    Token {
187        /// The wrapped content.
188        content: Box<Self>,
189    },
190    /// An immediate-token wrapper (no preceding whitespace).
191    ///
192    /// Treated like [`Production::Token`] for emit purposes.
193    #[serde(rename = "IMMEDIATE_TOKEN")]
194    ImmediateToken {
195        /// The wrapped content.
196        content: Box<Self>,
197    },
198    /// Precedence wrapper.
199    #[serde(rename = "PREC")]
200    Prec {
201        /// Precedence value (numeric or string). Ignored at emit time.
202        #[allow(dead_code)]
203        value: serde_json::Value,
204        /// The wrapped content.
205        content: Box<Self>,
206    },
207    /// Left-associative precedence wrapper.
208    #[serde(rename = "PREC_LEFT")]
209    PrecLeft {
210        /// Precedence value. Ignored at emit time.
211        #[allow(dead_code)]
212        value: serde_json::Value,
213        /// The wrapped content.
214        content: Box<Self>,
215    },
216    /// Right-associative precedence wrapper.
217    #[serde(rename = "PREC_RIGHT")]
218    PrecRight {
219        /// Precedence value. Ignored at emit time.
220        #[allow(dead_code)]
221        value: serde_json::Value,
222        /// The wrapped content.
223        content: Box<Self>,
224    },
225    /// Dynamic precedence wrapper.
226    #[serde(rename = "PREC_DYNAMIC")]
227    PrecDynamic {
228        /// Precedence value. Ignored at emit time.
229        #[allow(dead_code)]
230        value: serde_json::Value,
231        /// The wrapped content.
232        content: Box<Self>,
233    },
234    /// Reserved-word wrapper (tree-sitter ≥ 0.25).
235    ///
236    /// Tree-sitter's `RESERVED` rule marks an inner production as a
237    /// reserved-word context: the parser excludes the listed identifiers
238    /// from being treated as the inner symbol. The `context_name`
239    /// metadata names the reserved-word set; the emitter does not need
240    /// it (we are walking schema → bytes, not enforcing reserved-word
241    /// constraints), so we emit the inner content unchanged, the same
242    /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
243    #[serde(rename = "RESERVED")]
244    Reserved {
245        /// The wrapped content.
246        content: Box<Self>,
247        /// Name of the reserved-word context. Ignored at emit time.
248        #[allow(dead_code)]
249        #[serde(default)]
250        context_name: String,
251    },
252}
253
254/// A grammar's production-rule table, deserialized from `grammar.json`.
255///
256/// Only the fields the emitter consumes are decoded; precedences,
257/// conflicts, externals, and other parser-only metadata are ignored.
258#[derive(Debug, Clone, Deserialize)]
259#[non_exhaustive]
260pub struct Grammar {
261    /// Grammar name (e.g. `"rust"`, `"typescript"`).
262    #[allow(dead_code)]
263    pub name: String,
264    /// Map from rule name (a vertex kind on the schema side) to
265    /// production. Entries are kept in lexical order so iteration
266    /// is deterministic.
267    pub rules: BTreeMap<String, Production>,
268    /// Supertypes declared in the grammar's `supertypes` field. A
269    /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
270    /// references; tree-sitter parsers report a node's kind as one
271    /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
272    /// than the supertype name (`parameter`), so the emitter needs to
273    /// know that a child kind in a subtype set should match the
274    /// supertype name when a SYMBOL references it.
275    #[serde(default, deserialize_with = "deserialize_supertypes")]
276    pub supertypes: std::collections::HashSet<String>,
277    /// Tree-sitter `extras` rules: the named symbols (typically comments)
278    /// that tree-sitter skips at parse time but records as children of the
279    /// surrounding vertex. They appear nowhere in the production grammar,
280    /// so the rule walker cannot reconcile them against the cursor — the
281    /// emit pass therefore drains them as a side channel: at vertex entry
282    /// and between REPEAT iterations any leading extras-kind edges are
283    /// consumed and emitted directly. The set is populated at
284    /// `Grammar::from_bytes` by collecting every `SYMBOL { name }` and
285    /// named `ALIAS { value, named: true }` under the top-level `extras`
286    /// array. Pattern-only extras (e.g. `\s` whitespace) are not vertex
287    /// kinds and are excluded.
288    #[serde(default, deserialize_with = "deserialize_extras")]
289    pub extras: std::collections::HashSet<String>,
290    /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
291    /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
292    /// reference on the schema side.
293    ///
294    /// Built once at [`Grammar::from_bytes`] time by walking each
295    /// hidden rule (`_`-prefixed), declared supertype, and named
296    /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
297    /// recording the closure. This replaces the prior heuristic
298    /// `kind_satisfies_symbol` that walked the rule body on every
299    /// query: lookups are now O(1) and the relation is exactly the
300    /// transitive closure of "is reachable via hidden / supertype /
301    /// alias dispatch", with no over-expansion through non-hidden
302    /// non-supertype rule references.
303    #[serde(skip)]
304    pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
305}
306
307fn deserialize_supertypes<'de, D>(
308    deserializer: D,
309) -> Result<std::collections::HashSet<String>, D::Error>
310where
311    D: serde::Deserializer<'de>,
312{
313    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
314    let mut out = std::collections::HashSet::new();
315    for entry in entries {
316        match entry {
317            serde_json::Value::String(s) => {
318                out.insert(s);
319            }
320            serde_json::Value::Object(map) => {
321                if let Some(serde_json::Value::String(name)) = map.get("name") {
322                    out.insert(name.clone());
323                }
324            }
325            _ => {}
326        }
327    }
328    Ok(out)
329}
330
331fn deserialize_extras<'de, D>(
332    deserializer: D,
333) -> Result<std::collections::HashSet<String>, D::Error>
334where
335    D: serde::Deserializer<'de>,
336{
337    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
338    let mut out = std::collections::HashSet::new();
339    for entry in entries {
340        if let serde_json::Value::Object(map) = entry {
341            let ty = map.get("type").and_then(serde_json::Value::as_str);
342            match ty {
343                // SYMBOL { name: K } — the extras rule is a named symbol
344                // (typically `line_comment` / `block_comment`). The kind
345                // K appears as a real child vertex on the schema side.
346                Some("SYMBOL") => {
347                    if let Some(serde_json::Value::String(name)) = map.get("name") {
348                        out.insert(name.clone());
349                    }
350                }
351                // ALIAS { content, value: V, named: true } — the extras
352                // rule renames its content; V is the kind on the schema.
353                Some("ALIAS") => {
354                    let named = map
355                        .get("named")
356                        .and_then(serde_json::Value::as_bool)
357                        .unwrap_or(false);
358                    if named {
359                        if let Some(serde_json::Value::String(value)) = map.get("value") {
360                            out.insert(value.clone());
361                        }
362                    }
363                }
364                // PATTERN / STRING / TOKEN entries describe inter-token
365                // whitespace and have no vertex-side representation.
366                _ => {}
367            }
368        }
369    }
370    Ok(out)
371}
372
373impl Grammar {
374    /// Parse a grammar's `grammar.json` bytes.
375    ///
376    /// Builds the subtyping closure as part of construction so every
377    /// downstream lookup is O(1). The closure is the least relation
378    /// containing `(K, K)` for every rule key `K` and closed under:
379    ///
380    /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
381    ///   reach SYMBOL `K`, then `K ⊑ S`.
382    /// - supertype expansion: if `S` is in the grammar's supertypes
383    ///   block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
384    /// - alias renaming: if a rule body contains
385    ///   `ALIAS { content: SYMBOL R, value: A, named: true }` where
386    ///   `R` reaches kind `K` (or `K = R` when no further hop), then
387    ///   `A ⊑ R` and `K ⊑ A`.
388    ///
389    /// # Errors
390    ///
391    /// Returns [`ParseError::EmitFailed`] when the bytes are not a
392    /// valid `grammar.json` document.
393    pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
394        let mut grammar: Self =
395            serde_json::from_slice(bytes).map_err(|e| ParseError::EmitFailed {
396                protocol: protocol.to_owned(),
397                reason: format!("grammar.json deserialization failed: {e}"),
398            })?;
399        grammar.subtypes = compute_subtype_closure(&grammar);
400        Ok(grammar)
401    }
402}
403
404/// Compute the subtyping relation as a forward-indexed map from a
405/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
406fn compute_subtype_closure(
407    grammar: &Grammar,
408) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
409    use std::collections::{HashMap, HashSet};
410    // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
411    // recorded whenever Y is reached by walking the grammar's
412    // ALIAS / hidden-rule / supertype dispatch from a position where
413    // K is the actual vertex kind.
414    let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
415    for name in grammar.rules.keys() {
416        subtypes
417            .entry(name.clone())
418            .or_default()
419            .insert(name.clone());
420    }
421
422    // First pass: collect the immediate "satisfies" edges from each
423    // expandable rule (hidden, supertype) to the kinds reachable by
424    // walking its body, plus alias edges.
425    fn walk<'g>(
426        grammar: &'g Grammar,
427        production: &'g Production,
428        visited: &mut HashSet<&'g str>,
429        out: &mut HashSet<String>,
430    ) {
431        match production {
432            Production::Symbol { name } => {
433                // Direct subtype.
434                out.insert(name.clone());
435                // Continue expansion through hidden / supertype rules
436                // so the closure traverses pass-through dispatch.
437                let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
438                if expand && visited.insert(name.as_str()) {
439                    if let Some(rule) = grammar.rules.get(name) {
440                        walk(grammar, rule, visited, out);
441                    }
442                }
443            }
444            Production::Choice { members } | Production::Seq { members } => {
445                for m in members {
446                    walk(grammar, m, visited, out);
447                }
448            }
449            Production::Alias {
450                content,
451                named,
452                value,
453            } => {
454                if *named && !value.is_empty() {
455                    out.insert(value.clone());
456                }
457                walk(grammar, content, visited, out);
458            }
459            Production::Repeat { content }
460            | Production::Repeat1 { content }
461            | Production::Optional { content }
462            | Production::Field { content, .. }
463            | Production::Token { content }
464            | Production::ImmediateToken { content }
465            | Production::Prec { content, .. }
466            | Production::PrecLeft { content, .. }
467            | Production::PrecRight { content, .. }
468            | Production::PrecDynamic { content, .. }
469            | Production::Reserved { content, .. } => {
470                walk(grammar, content, visited, out);
471            }
472            _ => {}
473        }
474    }
475
476    for (name, rule) in &grammar.rules {
477        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
478        if !expand {
479            continue;
480        }
481        let mut visited: HashSet<&str> = HashSet::new();
482        visited.insert(name.as_str());
483        let mut reachable: HashSet<String> = HashSet::new();
484        walk(grammar, rule, &mut visited, &mut reachable);
485        for kind in &reachable {
486            subtypes
487                .entry(kind.clone())
488                .or_default()
489                .insert(name.clone());
490        }
491    }
492
493    // Aliases: scan every rule body for ALIAS { content, value }
494    // declarations. The kinds reachable from `content` satisfy
495    // `value`, AND (by construction) `value` satisfies the
496    // surrounding rule. Walking the ENTIRE grammar once captures
497    // every alias site, irrespective of which rule introduces it.
498    fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
499        match production {
500            Production::Alias {
501                content,
502                named,
503                value,
504            } => {
505                if *named && !value.is_empty() {
506                    out.push((value.clone(), content.as_ref()));
507                }
508                collect_aliases(content, out);
509            }
510            Production::Choice { members } | Production::Seq { members } => {
511                for m in members {
512                    collect_aliases(m, out);
513                }
514            }
515            Production::Repeat { content }
516            | Production::Repeat1 { content }
517            | Production::Optional { content }
518            | Production::Field { content, .. }
519            | Production::Token { content }
520            | Production::ImmediateToken { content }
521            | Production::Prec { content, .. }
522            | Production::PrecLeft { content, .. }
523            | Production::PrecRight { content, .. }
524            | Production::PrecDynamic { content, .. }
525            | Production::Reserved { content, .. } => {
526                collect_aliases(content, out);
527            }
528            _ => {}
529        }
530    }
531    let mut aliases: Vec<(String, &Production)> = Vec::new();
532    for rule in grammar.rules.values() {
533        collect_aliases(rule, &mut aliases);
534    }
535    for (alias_value, content) in aliases {
536        let mut visited: HashSet<&str> = HashSet::new();
537        let mut reachable: HashSet<String> = HashSet::new();
538        walk(grammar, content, &mut visited, &mut reachable);
539        // Aliased value satisfies itself and is satisfied by every
540        // kind its content can reach.
541        subtypes
542            .entry(alias_value.clone())
543            .or_default()
544            .insert(alias_value.clone());
545        for kind in reachable {
546            subtypes
547                .entry(kind)
548                .or_default()
549                .insert(alias_value.clone());
550        }
551    }
552
553    // Transitive close: `K ⊑ A` and `A ⊑ B` implies `K ⊑ B`. Iterate
554    // a few rounds; the relation is small so a quick fixed-point
555    // suffices in practice.
556    for _ in 0..8 {
557        let snapshot = subtypes.clone();
558        let mut changed = false;
559        for (kind, supers) in &snapshot {
560            let extra: HashSet<String> = supers
561                .iter()
562                .flat_map(|s| snapshot.get(s).cloned().unwrap_or_default())
563                .collect();
564            let entry = subtypes.entry(kind.clone()).or_default();
565            for s in extra {
566                if entry.insert(s) {
567                    changed = true;
568                }
569            }
570        }
571        if !changed {
572            break;
573        }
574    }
575
576    subtypes
577}
578
579// ═══════════════════════════════════════════════════════════════════
580// Format policy
581// ═══════════════════════════════════════════════════════════════════
582
583/// Whitespace and indentation policy applied during emission.
584///
585/// The default policy inserts a single space between adjacent tokens,
586/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
587/// boundaries. Per-language overrides (idiomatic indent width,
588/// trailing-comma rules, blank-line conventions) can ride alongside
589/// this struct in a follow-up branch; today's defaults aim only for
590/// syntactic validity.
591#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
592pub struct FormatPolicy {
593    /// Number of spaces per indent level.
594    pub indent_width: usize,
595    /// Separator inserted between adjacent terminals that the lexer
596    /// would otherwise glue together (word ↔ word, operator ↔ operator).
597    /// Default is a single space.
598    pub separator: String,
599    /// Newline byte sequence emitted after `line_break_after` tokens
600    /// and at end-of-output. Default is `"\n"`.
601    pub newline: String,
602    /// Tokens after which the walker breaks to a new line.
603    pub line_break_after: Vec<String>,
604    /// Tokens that increase indent on emission.
605    pub indent_open: Vec<String>,
606    /// Tokens that decrease indent on emission.
607    pub indent_close: Vec<String>,
608}
609
610impl Default for FormatPolicy {
611    fn default() -> Self {
612        Self {
613            indent_width: 2,
614            separator: " ".to_owned(),
615            newline: "\n".to_owned(),
616            line_break_after: vec![";".into(), "{".into(), "}".into()],
617            indent_open: vec!["{".into()],
618            indent_close: vec!["}".into()],
619        }
620    }
621}
622
623// ═══════════════════════════════════════════════════════════════════
624// Emitter
625// ═══════════════════════════════════════════════════════════════════
626
627/// Emit a by-construction schema to source bytes.
628///
629/// `protocol` is the grammar / language name (used in error messages
630/// and to label the entry point).
631///
632/// The walker treats `schema.entries` as the ordered list of root
633/// vertices, falling back to a deterministic by-id ordering when
634/// `entries` is empty. Each root is emitted using the production
635/// associated with its kind in `grammar.rules`.
636///
637/// # Errors
638///
639/// Returns [`ParseError::EmitFailed`] when:
640///
641/// - the schema has no vertices
642/// - a root vertex's kind is not a grammar rule
643/// - a `SYMBOL` reference points at a kind with no rule and no schema
644///   child to resolve it to
645/// - a required `FIELD` has no corresponding edge in the schema
646pub fn emit_pretty(
647    protocol: &str,
648    schema: &Schema,
649    grammar: &Grammar,
650    policy: &FormatPolicy,
651) -> Result<Vec<u8>, ParseError> {
652    let roots = collect_roots(schema);
653    if roots.is_empty() {
654        return Err(ParseError::EmitFailed {
655            protocol: protocol.to_owned(),
656            reason: "schema has no entry vertices".to_owned(),
657        });
658    }
659
660    let mut out = Output::new(policy);
661    for (i, root) in roots.iter().enumerate() {
662        if i > 0 {
663            out.newline();
664        }
665        emit_vertex(protocol, schema, grammar, root, &mut out)?;
666    }
667    Ok(out.finish())
668}
669
670fn collect_roots(schema: &Schema) -> Vec<&panproto_gat::Name> {
671    if !schema.entries.is_empty() {
672        return schema
673            .entries
674            .iter()
675            .filter(|name| schema.vertices.contains_key(*name))
676            .collect();
677    }
678
679    // Fallback: every vertex that is not the target of any structural edge
680    // (sorted by id for determinism).
681    let mut targets: std::collections::HashSet<&panproto_gat::Name> =
682        std::collections::HashSet::new();
683    for edge in schema.edges.keys() {
684        targets.insert(&edge.tgt);
685    }
686    let mut roots: Vec<&panproto_gat::Name> = schema
687        .vertices
688        .keys()
689        .filter(|name| !targets.contains(name))
690        .collect();
691    roots.sort();
692    roots
693}
694
695fn emit_vertex(
696    protocol: &str,
697    schema: &Schema,
698    grammar: &Grammar,
699    vertex_id: &panproto_gat::Name,
700    out: &mut Output<'_>,
701) -> Result<(), ParseError> {
702    let vertex = schema
703        .vertices
704        .get(vertex_id)
705        .ok_or_else(|| ParseError::EmitFailed {
706            protocol: protocol.to_owned(),
707            reason: format!("vertex '{vertex_id}' not found"),
708        })?;
709
710    // Leaf shortcut: a vertex carrying a `literal-value` constraint
711    // and no outgoing structural edges is a terminal token. Emit the
712    // captured value directly. This handles identifiers, numeric
713    // literals, and string literals that the parser stored as
714    // `literal-value` even on by-construction schemas.
715    if let Some(literal) = literal_value(schema, vertex_id) {
716        if children_for(schema, vertex_id).is_empty() {
717            out.token(literal);
718            return Ok(());
719        }
720    }
721
722    let kind = vertex.kind.as_ref();
723    let edges = children_for(schema, vertex_id);
724    if let Some(rule) = grammar.rules.get(kind) {
725        let mut cursor = ChildCursor::new(&edges);
726        emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out)?;
727        // Drain any extras left after the rule walk completed; tree-sitter
728        // may record trailing comments as children of the surrounding
729        // vertex (i.e. after the last structural child the rule matched).
730        drain_extras(protocol, schema, grammar, &mut cursor, out)?;
731        // Emit remaining unconsumed structural children that the grammar
732        // rule did not match. This covers cases where tree-sitter's parser
733        // produces child kinds not reachable from grammar.json's production
734        // rules (e.g. Julia's macrocall_expression can have an
735        // `argument_list` child even though the grammar only references
736        // `macro_argument_list`).
737        for (i, edge) in cursor.edges.iter().enumerate() {
738            if !cursor.consumed[i] {
739                let child_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
740                if child_kind.is_some_and(|k| !grammar.extras.contains(k)) {
741                    emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
742                }
743            }
744        }
745        return Ok(());
746    }
747
748    // No rule for this kind. The parser produced it via an ALIAS
749    // (tree-sitter's `alias($.something, $.actual_kind)`) or via an
750    // external scanner (e.g. YAML's `document` root). Fall back to
751    // walking the children directly so the inner content survives;
752    // surrounding tokens — whose only source is the missing rule —
753    // are necessarily absent.
754    for edge in &edges {
755        emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
756    }
757    Ok(())
758}
759
760/// Linear cursor over a vertex's outgoing edges, used to thread
761/// children through a production rule without double-consuming them.
762struct ChildCursor<'a> {
763    edges: &'a [&'a Edge],
764    consumed: Vec<bool>,
765}
766
767impl<'a> ChildCursor<'a> {
768    fn new(edges: &'a [&'a Edge]) -> Self {
769        Self {
770            edges,
771            consumed: vec![false; edges.len()],
772        }
773    }
774
775    /// Take the next unconsumed edge whose kind equals `field_name`.
776    fn take_field(&mut self, field_name: &str) -> Option<&'a Edge> {
777        for (i, edge) in self.edges.iter().enumerate() {
778            if !self.consumed[i] && edge.kind.as_ref() == field_name {
779                self.consumed[i] = true;
780                return Some(edge);
781            }
782        }
783        None
784    }
785
786    /// Whether any unconsumed edge satisfies `predicate`. Used by the
787    /// unit tests; the live emit path went through `has_matching` on
788    /// each alternative until cursor-driven dispatch was rewritten to
789    /// pick the first-unconsumed-edge's kind directly.
790    #[cfg(test)]
791    fn has_matching(&self, predicate: impl Fn(&Edge) -> bool) -> bool {
792        self.edges
793            .iter()
794            .enumerate()
795            .any(|(i, edge)| !self.consumed[i] && predicate(edge))
796    }
797
798    /// Take the next unconsumed edge whose target vertex satisfies
799    /// `predicate`. Returns the edge and the underlying production
800    /// resolution path is the caller's job.
801    fn take_matching(&mut self, predicate: impl Fn(&Edge) -> bool) -> Option<&'a Edge> {
802        for (i, edge) in self.edges.iter().enumerate() {
803            if !self.consumed[i] && predicate(edge) {
804                self.consumed[i] = true;
805                return Some(edge);
806            }
807        }
808        None
809    }
810}
811
812thread_local! {
813    static EMIT_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
814    /// Set of `(vertex_id, rule_name)` pairs that are currently being
815    /// walked by the recursion. A SYMBOL that resolves to a rule
816    /// already on this stack closes a μ-binder cycle: in the
817    /// coinductive reading, the rule walk at any vertex is the least
818    /// fixed point of `body[μ X . body / X]`, which unfolds at most
819    /// once, with the second visit returning the empty sequence (the
820    /// unit of the free token monoid). Examples that trigger this:
821    /// YAML's `stream` ⊃ `_b_blk_*` mutually-recursive chain, Rust's
822    /// `_expression` ⊃ `binary_expression` ⊃ `_expression`.
823    static EMIT_MU_FRAMES: std::cell::RefCell<std::collections::HashSet<(String, String)>> =
824        std::cell::RefCell::new(std::collections::HashSet::new());
825    /// The name of the FIELD whose body the walker is currently inside,
826    /// or `None` at top level. Lets a SYMBOL nested arbitrarily deep
827    /// in the field's content (under SEQ, CHOICE, REPEAT, OPTIONAL)
828    /// consume from the *outer* cursor by edge-kind rather than from
829    /// the child's own cursor by symbol-match. Without this, shapes
830    /// like `field('args', commaSep1($.X))` — which expands to
831    /// `FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X))))` — emit only
832    /// the first matched edge: the FIELD handler consumed one edge,
833    /// the inner REPEAT searched the consumed child's cursor (which
834    /// has no more sibling field edges), and the REPEAT broke after
835    /// one iteration. Setting the context here so the inner SYMBOL
836    /// pulls successive field-named edges from the outer cursor
837    /// recovers every matched edge across arbitrary nesting.
838    static EMIT_FIELD_CONTEXT: std::cell::RefCell<Option<String>> =
839        const { std::cell::RefCell::new(None) };
840}
841
842/// RAII guard that restores the prior `EMIT_FIELD_CONTEXT` value on drop.
843struct FieldContextGuard(Option<String>);
844
845impl Drop for FieldContextGuard {
846    fn drop(&mut self) {
847        EMIT_FIELD_CONTEXT.with(|f| *f.borrow_mut() = self.0.take());
848    }
849}
850
851fn push_field_context(name: &str) -> FieldContextGuard {
852    let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().replace(name.to_owned()));
853    FieldContextGuard(prev)
854}
855
856/// Clear the field context for the duration of a child-context walk.
857/// The child's own production has its own FIELDs that set their own
858/// context; the outer field hint must not leak into them.
859fn clear_field_context() -> FieldContextGuard {
860    let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().take());
861    FieldContextGuard(prev)
862}
863
864fn current_field_context() -> Option<String> {
865    EMIT_FIELD_CONTEXT.with(|f| f.borrow().clone())
866}
867
868/// Walk a rule at a vertex inside a μ-binder. The wrapping frame is
869/// pushed before recursion and popped after, so any SYMBOL inside
870/// `rule` that re-enters the same `(vertex_id, rule_name)` pair
871/// returns the empty sequence (μ X . body unfolds once).
872fn walk_in_mu_frame(
873    protocol: &str,
874    schema: &Schema,
875    grammar: &Grammar,
876    vertex_id: &panproto_gat::Name,
877    rule_name: &str,
878    rule: &Production,
879    cursor: &mut ChildCursor<'_>,
880    out: &mut Output<'_>,
881) -> Result<(), ParseError> {
882    let key = (vertex_id.to_string(), rule_name.to_owned());
883    let inserted = EMIT_MU_FRAMES.with(|frames| frames.borrow_mut().insert(key.clone()));
884    if !inserted {
885        // We are already walking this rule at this vertex deeper in
886        // the call stack. The coinductive μ-fixed-point reading
887        // returns the empty sequence here; the surrounding
888        // production resumes after the SYMBOL.
889        return Ok(());
890    }
891    let result = emit_production(protocol, schema, grammar, vertex_id, rule, cursor, out);
892    EMIT_MU_FRAMES.with(|frames| {
893        frames.borrow_mut().remove(&key);
894    });
895    result
896}
897
898fn emit_production(
899    protocol: &str,
900    schema: &Schema,
901    grammar: &Grammar,
902    vertex_id: &panproto_gat::Name,
903    production: &Production,
904    cursor: &mut ChildCursor<'_>,
905    out: &mut Output<'_>,
906) -> Result<(), ParseError> {
907    let depth = EMIT_DEPTH.with(|d| {
908        let v = d.get() + 1;
909        d.set(v);
910        v
911    });
912    if depth > 500 {
913        EMIT_DEPTH.with(|d| d.set(d.get() - 1));
914        return Err(ParseError::EmitFailed {
915            protocol: protocol.to_owned(),
916            reason: format!(
917                "emit_production recursion >500 (likely a cyclic grammar; \
918                     vertex='{vertex_id}')"
919            ),
920        });
921    }
922    drain_extras(protocol, schema, grammar, cursor, out)?;
923    let result = emit_production_inner(
924        protocol, schema, grammar, vertex_id, production, cursor, out,
925    );
926    EMIT_DEPTH.with(|d| d.set(d.get() - 1));
927    result
928}
929
930/// Consume and emit every leading edge on `cursor` whose target kind
931/// is in `grammar.extras` (typically `line_comment` / `block_comment`).
932/// Extras live outside the production grammar — tree-sitter skips them
933/// at parse time and records them as children of the surrounding
934/// vertex — so the rule walker cannot reconcile them against the
935/// cursor. Draining them as a side channel preserves their content in
936/// the output without confusing the structural matchers.
937fn drain_extras(
938    protocol: &str,
939    schema: &Schema,
940    grammar: &Grammar,
941    cursor: &mut ChildCursor<'_>,
942    out: &mut Output<'_>,
943) -> Result<(), ParseError> {
944    if grammar.extras.is_empty() {
945        return Ok(());
946    }
947    loop {
948        let next_extra: Option<usize> = cursor
949            .edges
950            .iter()
951            .enumerate()
952            .find(|(i, _)| !cursor.consumed[*i])
953            .and_then(|(i, edge)| {
954                let kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref())?;
955                if grammar.extras.contains(kind) {
956                    Some(i)
957                } else {
958                    None
959                }
960            });
961        let Some(idx) = next_extra else {
962            return Ok(());
963        };
964        cursor.consumed[idx] = true;
965        let target = &cursor.edges[idx].tgt;
966        emit_vertex(protocol, schema, grammar, target, out)?;
967    }
968}
969
970fn emit_production_inner(
971    protocol: &str,
972    schema: &Schema,
973    grammar: &Grammar,
974    vertex_id: &panproto_gat::Name,
975    production: &Production,
976    cursor: &mut ChildCursor<'_>,
977    out: &mut Output<'_>,
978) -> Result<(), ParseError> {
979    match production {
980        Production::String { value } => {
981            out.token(value);
982            Ok(())
983        }
984        Production::Pattern { value } => {
985            if let Some(literal) = literal_value(schema, vertex_id) {
986                out.token(literal);
987            } else if is_newline_like_pattern(value) {
988                // Patterns like `\r?\n`, `\n`, `\r\n` are the structural
989                // newline tokens grammars use to separate top-level
990                // statements (csound's `_new_line`, ABC's line-end, etc.).
991                // Emitting them through the placeholder fallback rendered
992                // the bare `_` sentinel between siblings; route them to
993                // the layout pass's line-break instead so the output
994                // re-parses.
995                out.newline();
996            } else if is_whitespace_only_pattern(value) {
997                // `\s+`, `[ \t]+` and friends are interstitial whitespace
998                // tokens. Emit nothing: the layout pass inserts the
999                // policy separator between adjacent Lits if needed.
1000            } else {
1001                out.token(&placeholder_for_pattern(value));
1002            }
1003            Ok(())
1004        }
1005        Production::Blank => Ok(()),
1006        Production::Symbol { name } => {
1007            // Inside a FIELD body, a SYMBOL consumes by field-name on
1008            // the outer cursor rather than searching by symbol-match.
1009            // This covers the simple `FIELD(SYMBOL X)` case as well as
1010            // every nesting under FIELD that contains SYMBOLs (SEQ,
1011            // CHOICE, REPEAT, OPTIONAL, ALIAS). Without the override,
1012            // shapes like `field('args', commaSep1($.X))` consume one
1013            // field edge in the FIELD handler and then the REPEAT
1014            // inside SEQ searches the consumed child's cursor — where
1015            // no sibling field edges sit — and breaks after one
1016            // iteration.
1017            if let Some(field) = current_field_context() {
1018                if let Some(edge) = cursor.take_field(&field) {
1019                    return emit_in_child_context(
1020                        protocol, schema, grammar, &edge.tgt, production, out,
1021                    );
1022                }
1023                // No matching field-named edge left on the outer
1024                // cursor. Surface nothing; the surrounding REPEAT /
1025                // OPTIONAL / CHOICE backtracks the literal tokens it
1026                // emitted on this iteration when it sees no progress.
1027                return Ok(());
1028            }
1029            if name.starts_with('_') {
1030                // Hidden rule: not a vertex kind on the schema side.
1031                // Inline-expand the rule body so its children take
1032                // edges from the current cursor, instead of trying to
1033                // take a single child edge that "satisfies" the
1034                // hidden rule and discarding the rest of the body
1035                // (which would drop tokens like `=` and the trailing
1036                // value SYMBOL inside e.g. TOML's `_inline_pair`).
1037                //
1038                // Wrapped in a μ-frame so a hidden rule that
1039                // references its own kind cyclically (or another
1040                // hidden rule that closes the cycle) unfolds once
1041                // and then collapses to the empty sequence at the
1042                // second visit, rather than blowing the stack.
1043                if let Some(rule) = grammar.rules.get(name) {
1044                    walk_in_mu_frame(
1045                        protocol, schema, grammar, vertex_id, name, rule, cursor, out,
1046                    )
1047                } else {
1048                    // External hidden rule (declared in the
1049                    // grammar's `externals` block, scanned by C code,
1050                    // not listed in `rules`). Heuristic fallback by
1051                    // name:
1052                    //
1053                    // - `_indent` / `*_indent`: open an indent block.
1054                    //   Indent-based grammars (Python, YAML, qvr)
1055                    //   declare an `_indent` external scanner before
1056                    //   the body of a block-bodied declaration; the
1057                    //   emitted output is unparseable without the
1058                    //   corresponding indentation jump.
1059                    // - `_dedent` / `*_dedent`: close the matching
1060                    //   indent block.
1061                    // - `_newline` / `*_line_ending` / `*_or_eof`:
1062                    //   universally newline-or-empty; emitting a
1063                    //   single newline is the right default for
1064                    //   grammars like TOML whose `pair` SEQ trails
1065                    //   into `_line_ending_or_eof`.
1066                    //
1067                    // Anything else falls through silently — better
1068                    // to drop an unknown external token than to
1069                    // invent one that confuses re-parsing.
1070                    if name == "_indent" || name.ends_with("_indent") {
1071                        out.indent_open();
1072                    } else if name == "_dedent" || name.ends_with("_dedent") {
1073                        out.indent_close();
1074                    } else if name.contains("line_ending")
1075                        || name.contains("newline")
1076                        || name.ends_with("_or_eof")
1077                    {
1078                        out.newline();
1079                    } else if name.contains("semicolon") {
1080                        out.token(";");
1081                    }
1082                    Ok(())
1083                }
1084            } else if let Some(edge) = take_symbol_match(grammar, schema, cursor, name) {
1085                // For supertype / hidden-rule dispatch the child's
1086                // own kind names the actual production to walk
1087                // (`child.kind` IS the subtype). For ALIAS the
1088                // dependent-optic context is carried by the
1089                // surrounding `Production::Alias` branch, which calls
1090                // `emit_aliased_child` directly; we don't reach here
1091                // for that case. So walking `grammar.rules[child.kind]`
1092                // via `emit_vertex` is correct: the dependent-optic
1093                // path is preserved at every site where it actually
1094                // diverges from `child.kind`.
1095                emit_vertex(protocol, schema, grammar, &edge.tgt, out)
1096            } else if vertex_id_kind(schema, vertex_id) == Some(name.as_str()) {
1097                let rule = grammar
1098                    .rules
1099                    .get(name)
1100                    .ok_or_else(|| ParseError::EmitFailed {
1101                        protocol: protocol.to_owned(),
1102                        reason: format!("no production for SYMBOL '{name}'"),
1103                    })?;
1104                // Self-reference (`X = ... SYMBOL X ...`): wrap in a
1105                // μ-frame so re-entry collapses to the empty sequence.
1106                walk_in_mu_frame(
1107                    protocol, schema, grammar, vertex_id, name, rule, cursor, out,
1108                )
1109            } else {
1110                // Named rule with no matching child: emit nothing and
1111                // let the surrounding CHOICE / OPTIONAL / REPEAT
1112                // resolve the absence.
1113                Ok(())
1114            }
1115        }
1116        Production::Seq { members } => {
1117            for member in members {
1118                emit_production(protocol, schema, grammar, vertex_id, member, cursor, out)?;
1119            }
1120            Ok(())
1121        }
1122        Production::Choice { members } => {
1123            if let Some(matched) =
1124                pick_choice_with_cursor(schema, grammar, vertex_id, cursor, members)
1125            {
1126                emit_production(protocol, schema, grammar, vertex_id, matched, cursor, out)
1127            } else {
1128                Ok(())
1129            }
1130        }
1131        Production::Repeat { content } | Production::Repeat1 { content } => {
1132            // Detect a "separator-leading SEQ" iteration body: SEQ whose
1133            // first member is a CHOICE containing BLANK (or an OPTIONAL),
1134            // i.e. the source-level separator between two iterations is
1135            // syntactically optional. When the chosen alternative for
1136            // that separator slot emits zero content tokens at runtime,
1137            // there was no source-level separator between this iteration
1138            // and the previous one; the layout pass must suppress its
1139            // policy separator to match the source's tight adjacency.
1140            //
1141            // Categorical reading: REPEAT body `B = SEQ(SEP, BODY)` is
1142            // the pullback of two halves. The bytes emitted in iteration
1143            // k+1 are a concatenation of `SEP_k+1` and `BODY_k+1`; if
1144            // `SEP_k+1` is the empty word, the concatenation of
1145            // `BODY_k` and `BODY_k+1` must remain a single contiguous
1146            // span. Hence the NoSpace marker.
1147            let separator_leading_seq: Option<&[Production]> = match content.as_ref() {
1148                Production::Seq { members } if members.len() >= 2 => {
1149                    let first = &members[0];
1150                    let is_separator_slot = match first {
1151                        Production::Choice { members } => {
1152                            members.iter().any(|m| matches!(m, Production::Blank))
1153                        }
1154                        Production::Optional { .. } => true,
1155                        _ => false,
1156                    };
1157                    if is_separator_slot {
1158                        Some(members.as_slice())
1159                    } else {
1160                        None
1161                    }
1162                }
1163                _ => None,
1164            };
1165
1166            let mut emitted_any = false;
1167            loop {
1168                let cursor_snap = cursor.consumed.clone();
1169                let out_snap = out.snapshot();
1170                let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1171                let result: Result<(), ParseError> =
1172                    if let Some(seq_members) = separator_leading_seq {
1173                        // Emit the separator slot first and observe
1174                        // whether it contributed any Lit. If not, push
1175                        // a NoSpace marker before walking the remaining
1176                        // SEQ members. The OutputSnapshot here covers
1177                        // only the separator's emission window.
1178                        let pre_sep = out.snapshot();
1179                        let sep_result = emit_production(
1180                            protocol,
1181                            schema,
1182                            grammar,
1183                            vertex_id,
1184                            &seq_members[0],
1185                            cursor,
1186                            out,
1187                        );
1188                        match sep_result {
1189                            Err(e) => Err(e),
1190                            Ok(()) => {
1191                                if !out.lit_emitted_since(pre_sep) {
1192                                    out.no_space();
1193                                }
1194                                let mut rest_result = Ok(());
1195                                for member in &seq_members[1..] {
1196                                    rest_result = emit_production(
1197                                        protocol, schema, grammar, vertex_id, member, cursor, out,
1198                                    );
1199                                    if rest_result.is_err() {
1200                                        break;
1201                                    }
1202                                }
1203                                rest_result
1204                            }
1205                        }
1206                    } else {
1207                        emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1208                    };
1209                let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1210                if result.is_err() || consumed_after == consumed_before {
1211                    cursor.consumed = cursor_snap;
1212                    out.restore(out_snap);
1213                    break;
1214                }
1215                emitted_any = true;
1216            }
1217            if matches!(production, Production::Repeat1 { .. }) && !emitted_any {
1218                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)?;
1219            }
1220            Ok(())
1221        }
1222        Production::Optional { content } => {
1223            let cursor_snap = cursor.consumed.clone();
1224            let out_snap = out.snapshot();
1225            let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1226            let result =
1227                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
1228            // OPTIONAL is a backtracking site: if the inner production
1229            // errored *or* made no progress without leaving a witness
1230            // constraint, restore both cursor and output to their
1231            // pre-attempt state. Mirrors `Repeat`'s loop body.
1232            if result.is_err() {
1233                cursor.consumed = cursor_snap;
1234                out.restore(out_snap);
1235                return result;
1236            }
1237            let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1238            if consumed_after == consumed_before
1239                && !has_relevant_constraint(content, schema, vertex_id)
1240            {
1241                cursor.consumed = cursor_snap;
1242                out.restore(out_snap);
1243            }
1244            Ok(())
1245        }
1246        Production::Field { name, content } => {
1247            // Set the field context for the duration of `content`'s
1248            // walk and emit the content against the *outer* cursor.
1249            // The SYMBOL handler picks up the context and pulls
1250            // successive `take_field(name)` edges as it encounters
1251            // SYMBOLs anywhere under `content` (under SEQ, CHOICE,
1252            // REPEAT, OPTIONAL, ALIAS — arbitrarily nested). This
1253            // subsumes the prior carve-outs for FIELD(REPEAT(...)),
1254            // FIELD(REPEAT1(...)), and the bare FIELD(SYMBOL ...)
1255            // case, and adds coverage for
1256            // `field('xs', commaSep1($.X))` which expands to
1257            // FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X)))) and
1258            // any other shape where REPEAT/REPEAT1 sits inside SEQ /
1259            // CHOICE / OPTIONAL under a FIELD. A FIELD that wraps a
1260            // non-SYMBOL production (e.g. `field('op', '+')` or
1261            // `field('op', CHOICE(STRING ...))`) still works: STRING
1262            // handlers ignore the context and emit literals
1263            // directly, so the operator token survives the round
1264            // trip.
1265            let _guard = push_field_context(name);
1266            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1267        }
1268        Production::Alias {
1269            content,
1270            named,
1271            value,
1272        } => {
1273            // A named ALIAS rewrites the parser-visible kind to
1274            // `value`. If the cursor has an unconsumed child whose
1275            // kind matches that alias name, take it and emit the
1276            // child using the alias's INNER content as the rule
1277            // (e.g. `ALIAS { SYMBOL real_rule, value: "kind_x" }`
1278            // means a `kind_x` vertex on the schema should be walked
1279            // through `real_rule`'s body, not through whatever rule
1280            // happens to be keyed under `kind_x`). This is the
1281            // dependent-optic shape: the rule the emitter walks at a
1282            // child position is determined by the parent's chosen
1283            // alias, not by the child kind alone — without it,
1284            // grammars like YAML that introduce the same kind through
1285            // many ALIAS sites lose the parent context the moment
1286            // emit_vertex is called.
1287            if *named && !value.is_empty() {
1288                if let Some(edge) = cursor.take_matching(|edge| {
1289                    schema
1290                        .vertices
1291                        .get(&edge.tgt)
1292                        .map(|v| v.kind.as_ref() == value.as_str())
1293                        .unwrap_or(false)
1294                }) {
1295                    return emit_aliased_child(protocol, schema, grammar, &edge.tgt, content, out);
1296                }
1297            }
1298            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1299        }
1300        Production::Token { content }
1301        | Production::ImmediateToken { content }
1302        | Production::Prec { content, .. }
1303        | Production::PrecLeft { content, .. }
1304        | Production::PrecRight { content, .. }
1305        | Production::PrecDynamic { content, .. }
1306        | Production::Reserved { content, .. } => {
1307            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1308        }
1309    }
1310}
1311
1312/// Take the next cursor edge whose target vertex's kind matches the
1313/// SYMBOL `name` directly or via inline expansion of a hidden rule.
1314fn take_symbol_match<'a>(
1315    grammar: &Grammar,
1316    schema: &Schema,
1317    cursor: &mut ChildCursor<'a>,
1318    name: &str,
1319) -> Option<&'a Edge> {
1320    cursor.take_matching(|edge| {
1321        let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1322        kind_satisfies_symbol(grammar, target_kind, name)
1323    })
1324}
1325
1326/// Decide whether a schema vertex of kind `target_kind` satisfies a
1327/// SYMBOL `name` reference in the grammar.
1328///
1329/// Operates as an O(1) lookup against the precomputed subtype
1330/// closure built at [`Grammar::from_bytes`]. The semantic content is
1331/// "K satisfies SYMBOL S iff K is reachable from S by walking the
1332/// grammar's hidden, supertype, and named-alias dispatch": this is
1333/// exactly the relation tree-sitter induces on `(parser-visible kind,
1334/// rule-position)` pairs.
1335fn kind_satisfies_symbol(grammar: &Grammar, target_kind: Option<&str>, name: &str) -> bool {
1336    let Some(target) = target_kind else {
1337        return false;
1338    };
1339    if target == name {
1340        return true;
1341    }
1342    if grammar
1343        .subtypes
1344        .get(target)
1345        .is_some_and(|set| set.contains(name))
1346    {
1347        return true;
1348    }
1349    grammar
1350        .subtypes
1351        .get(name)
1352        .is_some_and(|set| set.contains(target))
1353}
1354
1355/// Direct match only: `target_kind` equals `name` or is in `name`'s
1356/// subtype set, without the reverse lookup. Used by the first pass of
1357/// cursor-driven CHOICE dispatch to prefer exact matches.
1358fn kind_directly_satisfies_symbol(
1359    grammar: &Grammar,
1360    target_kind: Option<&str>,
1361    name: &str,
1362) -> bool {
1363    let Some(target) = target_kind else {
1364        return false;
1365    };
1366    if target == name {
1367        return true;
1368    }
1369    grammar
1370        .subtypes
1371        .get(name)
1372        .is_some_and(|set| set.contains(target))
1373}
1374
1375/// Emit a child reached through an ALIAS production using the
1376/// alias's inner content as the rule, not `grammar.rules[child.kind]`.
1377///
1378/// This carries the dependent-optic context across the ALIAS edge:
1379/// at the parent rule's site we know which underlying production the
1380/// alias wraps (typically `SYMBOL real_rule`), and that's the
1381/// production that should drive the emit walk on the child's
1382/// children. Looking up `grammar.rules.get(child.kind)` instead would
1383/// either fail (the renamed kind has no top-level rule, e.g. YAML's
1384/// `block_mapping_pair`) or pick an arbitrary same-kinded rule from
1385/// elsewhere in the grammar.
1386///
1387/// Walk-context invariant. The dependent-optic shape of `emit_pretty`
1388/// says: the production walked at any vertex is determined by the
1389/// path from the root through the grammar, not by the vertex kind in
1390/// isolation. Two dispatch sites realise that invariant:
1391///
1392/// * [`emit_vertex`] looks up `grammar.rules[child.kind]` and walks
1393///   it. Correct for supertype / hidden-rule dispatch: the child's
1394///   kind on the schema IS the subtype tree-sitter selected, so its
1395///   top-level rule is the right production to walk.
1396/// * `emit_aliased_child` threads the parent rule's `Production`
1397///   directly (the inner `content` of `Production::Alias`) and walks
1398///   it on the child's children. Correct for ALIAS dispatch: the
1399///   child's kind on the schema is the alias's `value` (a renamed
1400///   kind that may have no top-level rule), and the production to
1401///   walk is the alias's content body, supplied by the parent.
1402///
1403/// Together these cover every site where the rule-walked-at-child
1404/// diverges from `grammar.rules[child.kind]`; the recursion site for
1405/// plain SYMBOL therefore correctly delegates to `emit_vertex`, and
1406/// we do not need a richer `WalkContext` value passed by reference.
1407/// The grammar dependency is the thread.
1408fn emit_aliased_child(
1409    protocol: &str,
1410    schema: &Schema,
1411    grammar: &Grammar,
1412    child_id: &panproto_gat::Name,
1413    content: &Production,
1414    out: &mut Output<'_>,
1415) -> Result<(), ParseError> {
1416    // Leaf shortcut: if the child has a literal-value and no
1417    // structural children, emit the captured text. Identifiers and
1418    // similar terminals reach here when an ALIAS wraps a SYMBOL that
1419    // resolves to a PATTERN.
1420    if let Some(literal) = literal_value(schema, child_id) {
1421        if children_for(schema, child_id).is_empty() {
1422            out.token(literal);
1423            return Ok(());
1424        }
1425    }
1426
1427    // Resolve `content` to a rule when it's a SYMBOL (the dominant
1428    // shape: `ALIAS { content: SYMBOL real_rule, value: "kind_x" }`).
1429    if let Production::Symbol { name } = content {
1430        if let Some(rule) = grammar.rules.get(name) {
1431            let edges = children_for(schema, child_id);
1432            let mut cursor = ChildCursor::new(&edges);
1433            return emit_production(protocol, schema, grammar, child_id, rule, &mut cursor, out);
1434        }
1435    }
1436
1437    // Other ALIAS contents (CHOICE, SEQ, literals) walk in place.
1438    let edges = children_for(schema, child_id);
1439    let mut cursor = ChildCursor::new(&edges);
1440    emit_production(
1441        protocol,
1442        schema,
1443        grammar,
1444        child_id,
1445        content,
1446        &mut cursor,
1447        out,
1448    )
1449}
1450
1451fn emit_in_child_context(
1452    protocol: &str,
1453    schema: &Schema,
1454    grammar: &Grammar,
1455    child_id: &panproto_gat::Name,
1456    production: &Production,
1457    out: &mut Output<'_>,
1458) -> Result<(), ParseError> {
1459    // The child walks under its own production tree, with its own
1460    // FIELDs setting their own contexts. Clear the outer FIELD hint
1461    // so it does not leak through and cause sibling SYMBOLs inside
1462    // the child's body to mistakenly pull edges from the child's
1463    // cursor by the parent's field name.
1464    let _guard = clear_field_context();
1465    // If `production` is a structural wrapper (CHOICE / SEQ /
1466    // OPTIONAL / ...) whose referenced symbols cover the child's own
1467    // kind, the child IS the production's target node and the right
1468    // emit path is `emit_vertex(child)` (which honours the
1469    // literal-value leaf shortcut). Without this guard, FIELD(pattern,
1470    // CHOICE { _pattern, self }) on an identifier child walks the
1471    // CHOICE on the identifier's empty cursor, falls through to the
1472    // first non-BLANK alt, and loses the captured identifier text.
1473    if !matches!(production, Production::Symbol { .. }) {
1474        let child_kind = schema.vertices.get(child_id).map(|v| v.kind.as_ref());
1475        let symbols = referenced_symbols(production);
1476        if symbols
1477            .iter()
1478            .any(|s| kind_satisfies_symbol(grammar, child_kind, s) || child_kind == Some(s))
1479        {
1480            return emit_vertex(protocol, schema, grammar, child_id, out);
1481        }
1482    }
1483    match production {
1484        Production::Symbol { .. } => emit_vertex(protocol, schema, grammar, child_id, out),
1485        _ => {
1486            let edges = children_for(schema, child_id);
1487            let mut cursor = ChildCursor::new(&edges);
1488            emit_production(
1489                protocol,
1490                schema,
1491                grammar,
1492                child_id,
1493                production,
1494                &mut cursor,
1495                out,
1496            )
1497        }
1498    }
1499}
1500
1501fn pick_choice_with_cursor<'a>(
1502    schema: &Schema,
1503    grammar: &Grammar,
1504    vertex_id: &panproto_gat::Name,
1505    cursor: &ChildCursor<'_>,
1506    alternatives: &'a [Production],
1507) -> Option<&'a Production> {
1508    // Discriminator-driven dispatch (highest priority): when the
1509    // walker recorded a `chose-alt-fingerprint` constraint at parse
1510    // time, dispatch directly against that. This is the categorical
1511    // discriminator: it survives stripping of byte-position
1512    // constraints (so by-construction round-trips work) and is the
1513    // explicit witness of which CHOICE alternative the parser took.
1514    //
1515    // Falls back to the live `interstitial-*` substring blob when no
1516    // fingerprint is present (e.g. instances built by callers that
1517    // bypass the AstWalker). Both blobs are scored by the longest
1518    // STRING-literal token in an alternative that matches; the
1519    // length tiebreak prefers `&&` over `&`, `==` over `=`, etc.
1520    let constraint_blob = schema
1521        .constraints
1522        .get(vertex_id)
1523        .map(|cs| {
1524            let fingerprint: Option<&str> = cs
1525                .iter()
1526                .find(|c| c.sort.as_ref() == "chose-alt-fingerprint")
1527                .map(|c| c.value.as_str());
1528            if let Some(fp) = fingerprint {
1529                fp.to_owned()
1530            } else {
1531                cs.iter()
1532                    .filter(|c| {
1533                        let s = c.sort.as_ref();
1534                        s.starts_with("interstitial-") && !s.ends_with("-start-byte")
1535                    })
1536                    .map(|c| c.value.as_str())
1537                    .collect::<Vec<&str>>()
1538                    .join(" ")
1539            }
1540        })
1541        .unwrap_or_default();
1542    let child_kinds: Vec<&str> = schema
1543        .constraints
1544        .get(vertex_id)
1545        .and_then(|cs| {
1546            cs.iter()
1547                .find(|c| c.sort.as_ref() == "chose-alt-child-kinds")
1548                .map(|c| c.value.split_whitespace().collect())
1549        })
1550        .unwrap_or_default();
1551    // Cursor-exhaustion BLANK-preference: when all cursor edges have
1552    // been consumed AND `BLANK` is one of the alternatives, the only
1553    // alt that won't introduce a non-existent child is `BLANK`.
1554    //
1555    // This gate fires before the literal-blob discriminator because
1556    // the fingerprint is shared across every CHOICE position in the
1557    // vertex's rule body: a vertex like `sample_step` that ends in
1558    // `..., REPEAT(SEQ(",", arg)), CHOICE(",", BLANK)` records all of
1559    // its `","` interstitials in a single blob, so the literal-score
1560    // matcher would otherwise prefer `","` for the trailing CHOICE
1561    // even when the source had no trailing comma. By the time the
1562    // emitter reaches the trailing CHOICE, the REPEAT has consumed
1563    // every arg edge in cursor order; the residual unconsumed multiset
1564    // is empty; and the categorical reading of a CHOICE-with-BLANK at
1565    // a position with no remaining children is the no-op alternative.
1566    let any_unconsumed = cursor
1567        .edges
1568        .iter()
1569        .enumerate()
1570        .any(|(i, _)| !cursor.consumed[i]);
1571    let blank_present = alternatives.iter().any(|a| matches!(a, Production::Blank));
1572    if !any_unconsumed && blank_present {
1573        return alternatives.iter().find(|a| matches!(a, Production::Blank));
1574    }
1575
1576    if !constraint_blob.is_empty() {
1577        // Primary score: literal-token match length. This dominates
1578        // alt selection so existing language tests that depend on
1579        // literal-only fingerprints keep working.
1580        // Secondary score (tiebreaker only): named-symbol kind match
1581        // count, read from the separate `chose-alt-child-kinds`
1582        // constraint (kept apart from the literal fingerprint so
1583        // identifiers like `:` in the kind list don't contaminate the
1584        // literal match). An alt that matches the recorded kinds is a
1585        // stronger witness than one whose only
1586        // overlap is literal punctuation.
1587        let mut best_literal: usize = 0;
1588        let mut best_symbols: usize = 0;
1589        let mut best_alt: Option<&Production> = None;
1590        let mut tied = false;
1591        for alt in alternatives {
1592            let strings = literal_strings(alt);
1593            if strings.is_empty() {
1594                continue;
1595            }
1596            let literal_score = strings
1597                .iter()
1598                .filter(|s| constraint_blob.contains(s.as_str()))
1599                .map(String::len)
1600                .sum::<usize>();
1601            if literal_score == 0 {
1602                continue;
1603            }
1604            // Symbol score is computed only as a tiebreaker among alts
1605            // whose literal-token coverage is the same; it never lifts
1606            // an alt above one with a strictly higher literal score.
1607            // Reads the `chose-alt-child-kinds` constraint (a separate
1608            // sequence the walker emits, kept apart from the literal
1609            // fingerprint to avoid cross-contamination).
1610            let symbol_score = if literal_score >= best_literal && !child_kinds.is_empty() {
1611                let symbols = referenced_symbols(alt);
1612                symbols
1613                    .iter()
1614                    .filter(|sym| {
1615                        let sym_str: &str = sym;
1616                        if child_kinds.contains(&sym_str) {
1617                            return true;
1618                        }
1619                        grammar.subtypes.get(sym_str).is_some_and(|sub_set| {
1620                            sub_set
1621                                .iter()
1622                                .any(|sub| child_kinds.contains(&sub.as_str()))
1623                        })
1624                    })
1625                    .count()
1626            } else {
1627                0
1628            };
1629            let better = literal_score > best_literal
1630                || (literal_score == best_literal && symbol_score > best_symbols);
1631            let same = literal_score == best_literal && symbol_score == best_symbols;
1632            if better {
1633                best_literal = literal_score;
1634                best_symbols = symbol_score;
1635                best_alt = Some(alt);
1636                tied = false;
1637            } else if same && best_alt.is_some() {
1638                tied = true;
1639            }
1640        }
1641        // Only commit to an alt when the fingerprint discriminates it
1642        // uniquely. A tie means the alts share the same literal token
1643        // set (e.g. JSON's `string = CHOICE { SEQ { '"', '"' }, SEQ {
1644        // '"', _string_content, '"' } }` — both alts contain just the
1645        // two `"` tokens). In that case fall through to cursor-based
1646        // dispatch, which uses the actual edge structure.
1647        if let Some(alt) = best_alt {
1648            if !tied {
1649                return Some(alt);
1650            }
1651        }
1652    }
1653
1654    // Cursor-driven dispatch: pick the alternative whose body
1655    // references a SYMBOL covering the *first unconsumed* edge in
1656    // cursor order. `referenced_symbols` walks the alternative
1657    // recursively (across nested SEQs, REPEATs, OPTIONALs, FIELDs,
1658    // etc.) so a leading optional like `attribute_item` does not
1659    // block matching when only the trailing required symbol is
1660    // present on the schema.
1661    //
1662    // Ordering by the first unconsumed edge (rather than picking any
1663    // alternative whose SYMBOL set intersects the unconsumed
1664    // multiset) is what preserves schema edge order under
1665    // REPEAT(CHOICE(...)) productions. Without this rule, alt order
1666    // in the grammar's CHOICE determines the emission order, and a
1667    // schema with interleaved kinds like `[symbol, punct, int,
1668    // symbol, punct, int]` re-fuses to `[symbol, symbol, punct,
1669    // punct, int, int]` when emitted then re-parsed. The fix is the
1670    // categorical reading of REPEAT-over-list (list-shaped fold)
1671    // rather than REPEAT-over-multiset (unordered fold).
1672    let first_unconsumed_kind: Option<&str> = cursor
1673        .edges
1674        .iter()
1675        .enumerate()
1676        .find(|(i, _)| !cursor.consumed[*i])
1677        .and_then(|(_, edge)| schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref()));
1678    if let Some(target_kind) = first_unconsumed_kind {
1679        // Prefer an alternative with a direct symbol match before
1680        // accepting a subtype match. Without this two-pass approach,
1681        // a CHOICE like `[ALIAS(_qualified_macro_identifier, ...),
1682        // SYMBOL macro_identifier]` incorrectly picks the ALIAS
1683        // alternative when the cursor holds a `macro_identifier`
1684        // child: the subtype closure records `macro_identifier` as
1685        // reachable from `_qualified_macro_identifier` (because the
1686        // hidden rule's SEQ contains it), so the first alt's symbol
1687        // set satisfies the cursor. The direct-match pass ensures the
1688        // second alt (which references `macro_identifier` directly)
1689        // wins.
1690        for alt in alternatives {
1691            let symbols = referenced_symbols(alt);
1692            if !symbols.is_empty()
1693                && symbols
1694                    .iter()
1695                    .any(|s| kind_directly_satisfies_symbol(grammar, Some(target_kind), s))
1696            {
1697                return Some(alt);
1698            }
1699        }
1700        for alt in alternatives {
1701            let symbols = referenced_symbols(alt);
1702            if !symbols.is_empty()
1703                && symbols
1704                    .iter()
1705                    .any(|s| kind_satisfies_symbol(grammar, Some(target_kind), s))
1706            {
1707                return Some(alt);
1708            }
1709        }
1710    }
1711
1712    // FIELD dispatch: pick an alternative whose FIELD name matches an
1713    // unconsumed edge kind.
1714    let edge_kinds: Vec<&str> = cursor
1715        .edges
1716        .iter()
1717        .enumerate()
1718        .filter(|(i, _)| !cursor.consumed[*i])
1719        .map(|(_, e)| e.kind.as_ref())
1720        .collect();
1721    for alt in alternatives {
1722        if has_field_in(alt, &edge_kinds) {
1723            return Some(alt);
1724        }
1725    }
1726
1727    // No cursor-driven match. Fall back to:
1728    //
1729    // - BLANK (the explicit empty alternative) when present, so an
1730    //   OPTIONAL-shaped CHOICE compiles to nothing.
1731    // - The first non-`BLANK` alternative as a last resort, used by
1732    //   STRING-only alternatives (keyword tokens) and other choices
1733    //   that don't reach the cursor.
1734    //
1735    // The previous "match own_kind" branch is intentionally absent:
1736    // when an alt's first SYMBOL equals the current vertex's kind, the
1737    // caller is already emitting that vertex's own rule. Recursing
1738    // into the alt would cause a self-loop in the rule walk.
1739    let _ = (schema, vertex_id);
1740    if alternatives.iter().any(|a| matches!(a, Production::Blank)) {
1741        return alternatives.iter().find(|a| matches!(a, Production::Blank));
1742    }
1743    alternatives
1744        .iter()
1745        .find(|alt| !matches!(alt, Production::Blank))
1746}
1747
1748/// Collect every literal STRING token directly inside `production`
1749/// (without descending into SYMBOLs / hidden rules). Used to score
1750/// CHOICE alternatives against the parent vertex's interstitials so
1751/// the right operator / keyword form is picked when the schema
1752/// preserves interstitial fragments from a prior parse.
1753fn literal_strings(production: &Production) -> Vec<String> {
1754    let mut out = Vec::new();
1755    fn walk(p: &Production, out: &mut Vec<String>) {
1756        match p {
1757            Production::String { value } if !value.is_empty() => {
1758                out.push(value.clone());
1759            }
1760            Production::Choice { members } | Production::Seq { members } => {
1761                for m in members {
1762                    walk(m, out);
1763                }
1764            }
1765            Production::Repeat { content }
1766            | Production::Repeat1 { content }
1767            | Production::Optional { content }
1768            | Production::Field { content, .. }
1769            | Production::Alias { content, .. }
1770            | Production::Token { content }
1771            | Production::ImmediateToken { content }
1772            | Production::Prec { content, .. }
1773            | Production::PrecLeft { content, .. }
1774            | Production::PrecRight { content, .. }
1775            | Production::PrecDynamic { content, .. }
1776            | Production::Reserved { content, .. } => walk(content, out),
1777            _ => {}
1778        }
1779    }
1780    walk(production, &mut out);
1781    out
1782}
1783
1784/// Collect every SYMBOL name reachable from `production` without
1785/// crossing into nested rules. Used by `pick_choice_with_cursor` to
1786/// rank alternatives by "any SYMBOL inside this alt matches something
1787/// on the cursor", instead of just the first SYMBOL: a leading
1788/// optional like `attribute_item` then `parameter` is otherwise
1789/// rejected when only the parameter children are present.
1790fn referenced_symbols(production: &Production) -> Vec<&str> {
1791    let mut out = Vec::new();
1792    fn walk<'a>(p: &'a Production, out: &mut Vec<&'a str>) {
1793        match p {
1794            Production::Symbol { name } => out.push(name.as_str()),
1795            Production::Choice { members } | Production::Seq { members } => {
1796                for m in members {
1797                    walk(m, out);
1798                }
1799            }
1800            Production::Alias {
1801                content,
1802                named,
1803                value,
1804            } => {
1805                // A named ALIAS produces a child vertex whose kind is
1806                // the alias `value` (e.g. `ALIAS { content: STRING "=",
1807                // value: "punctuation", named: true }` introduces a
1808                // `punctuation` child). For cursor-driven dispatch to
1809                // recognise alts that emit such children, yield the
1810                // alias value as a referenced symbol. Anonymous aliases
1811                // do not introduce a named node and only need their
1812                // inner content's symbols.
1813                if *named && !value.is_empty() {
1814                    out.push(value.as_str());
1815                }
1816                walk(content, out);
1817            }
1818            Production::Repeat { content }
1819            | Production::Repeat1 { content }
1820            | Production::Optional { content }
1821            | Production::Field { content, .. }
1822            | Production::Token { content }
1823            | Production::ImmediateToken { content }
1824            | Production::Prec { content, .. }
1825            | Production::PrecLeft { content, .. }
1826            | Production::PrecRight { content, .. }
1827            | Production::PrecDynamic { content, .. }
1828            | Production::Reserved { content, .. } => walk(content, out),
1829            _ => {}
1830        }
1831    }
1832    walk(production, &mut out);
1833    out
1834}
1835
1836#[cfg(test)]
1837fn first_symbol(production: &Production) -> Option<&str> {
1838    match production {
1839        Production::Symbol { name } => Some(name),
1840        Production::Seq { members } => members.iter().find_map(first_symbol),
1841        Production::Choice { members } => members.iter().find_map(first_symbol),
1842        Production::Repeat { content }
1843        | Production::Repeat1 { content }
1844        | Production::Optional { content }
1845        | Production::Field { content, .. }
1846        | Production::Alias { content, .. }
1847        | Production::Token { content }
1848        | Production::ImmediateToken { content }
1849        | Production::Prec { content, .. }
1850        | Production::PrecLeft { content, .. }
1851        | Production::PrecRight { content, .. }
1852        | Production::PrecDynamic { content, .. }
1853        | Production::Reserved { content, .. } => first_symbol(content),
1854        _ => None,
1855    }
1856}
1857
1858fn has_field_in(production: &Production, edge_kinds: &[&str]) -> bool {
1859    match production {
1860        Production::Field { name, .. } => edge_kinds.contains(&name.as_str()),
1861        Production::Seq { members } | Production::Choice { members } => {
1862            members.iter().any(|m| has_field_in(m, edge_kinds))
1863        }
1864        Production::Repeat { content }
1865        | Production::Repeat1 { content }
1866        | Production::Optional { content }
1867        | Production::Alias { content, .. }
1868        | Production::Token { content }
1869        | Production::ImmediateToken { content }
1870        | Production::Prec { content, .. }
1871        | Production::PrecLeft { content, .. }
1872        | Production::PrecRight { content, .. }
1873        | Production::PrecDynamic { content, .. }
1874        | Production::Reserved { content, .. } => has_field_in(content, edge_kinds),
1875        _ => false,
1876    }
1877}
1878
1879fn has_relevant_constraint(
1880    production: &Production,
1881    schema: &Schema,
1882    vertex_id: &panproto_gat::Name,
1883) -> bool {
1884    let constraints = match schema.constraints.get(vertex_id) {
1885        Some(c) => c,
1886        None => return false,
1887    };
1888    fn walk(production: &Production, constraints: &[panproto_schema::Constraint]) -> bool {
1889        match production {
1890            Production::String { value } => constraints
1891                .iter()
1892                .any(|c| c.value == *value || c.sort.as_ref() == value),
1893            Production::Field { name, content } => {
1894                constraints.iter().any(|c| c.sort.as_ref() == name) || walk(content, constraints)
1895            }
1896            Production::Seq { members } | Production::Choice { members } => {
1897                members.iter().any(|m| walk(m, constraints))
1898            }
1899            Production::Repeat { content }
1900            | Production::Repeat1 { content }
1901            | Production::Optional { content }
1902            | Production::Alias { content, .. }
1903            | Production::Token { content }
1904            | Production::ImmediateToken { content }
1905            | Production::Prec { content, .. }
1906            | Production::PrecLeft { content, .. }
1907            | Production::PrecRight { content, .. }
1908            | Production::PrecDynamic { content, .. }
1909            | Production::Reserved { content, .. } => walk(content, constraints),
1910            _ => false,
1911        }
1912    }
1913    walk(production, constraints)
1914}
1915
1916fn children_for<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Vec<&'a Edge> {
1917    // Walk `outgoing` (insertion-ordered by SchemaBuilder via SmallVec
1918    // append) rather than the unordered `edges` HashMap so abstract
1919    // schemas under REPEAT(CHOICE(...)) preserve the order their edges
1920    // were inserted in. The previous implementation walked the HashMap
1921    // and sorted lexicographically by (kind, target id), which fused
1922    // interleaved children of the same kind into runs (e.g. a sequence
1923    // [symbol, punct, int, symbol, punct, int] became [symbol, symbol,
1924    // punct, punct, int, int] after the lex sort).
1925    let Some(edges) = schema.outgoing.get(vertex_id) else {
1926        return Vec::new();
1927    };
1928
1929    // Look up the canonical Edge reference (the key in `schema.edges`)
1930    // for each entry in `outgoing`. Falls back to the SmallVec entry if
1931    // the canonical key is missing, which would indicate index drift.
1932    let mut indexed: Vec<(usize, u32, &Edge)> = edges
1933        .iter()
1934        .enumerate()
1935        .map(|(i, e)| {
1936            let canonical = schema.edges.get_key_value(e).map_or(e, |(k, _)| k);
1937            let pos = schema.orderings.get(canonical).copied().unwrap_or(u32::MAX);
1938            (i, pos, canonical)
1939        })
1940        .collect();
1941
1942    // Stable sort by (explicit-ordering, insertion-index). Edges with
1943    // an explicit `orderings` entry come first in their declared order;
1944    // the remainder fall through in insertion order.
1945    indexed.sort_by_key(|(i, pos, _)| (*pos, *i));
1946    indexed.into_iter().map(|(_, _, e)| e).collect()
1947}
1948
1949fn vertex_id_kind<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1950    schema.vertices.get(vertex_id).map(|v| v.kind.as_ref())
1951}
1952
1953fn literal_value<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1954    schema
1955        .constraints
1956        .get(vertex_id)?
1957        .iter()
1958        .find(|c| c.sort.as_ref() == "literal-value")
1959        .map(|c| c.value.as_str())
1960}
1961
1962/// True iff `pattern` matches a (possibly optional / repeated) sequence
1963/// of carriage-return and newline characters only. Examples: `\r?\n`,
1964/// `\n`, `\r\n`, `\n+`, `\r?\n+`. Distinguishes structural newline
1965/// terminals from generic whitespace and from other patterns that
1966/// happen to contain a newline escape inside a larger class.
1967fn is_newline_like_pattern(pattern: &str) -> bool {
1968    if pattern.is_empty() {
1969        return false;
1970    }
1971    let mut chars = pattern.chars();
1972    let mut saw_newline_atom = false;
1973    while let Some(c) = chars.next() {
1974        match c {
1975            '\\' => match chars.next() {
1976                Some('n' | 'r') => saw_newline_atom = true,
1977                _ => return false,
1978            },
1979            '?' | '*' | '+' => {} // quantifiers on the previous atom
1980            _ => return false,
1981        }
1982    }
1983    saw_newline_atom
1984}
1985
1986/// True iff `pattern` matches a (possibly quantified) run of generic
1987/// whitespace characters: `\s+`, `[ \t]+`, ` +`, `\s*`. Such patterns
1988/// describe interstitial spacing rather than syntactic content, so the
1989/// pretty emitter can drop them and let the layout pass insert the
1990/// configured separator.
1991fn is_whitespace_only_pattern(pattern: &str) -> bool {
1992    if pattern.is_empty() {
1993        return false;
1994    }
1995    // Strip an outer quantifier suffix.
1996    let trimmed = pattern.trim_end_matches(['?', '*', '+']);
1997    if trimmed.is_empty() {
1998        return false;
1999    }
2000    // Bare `\s` / ` ` / `\t`.
2001    if matches!(trimmed, "\\s" | " " | "\\t") {
2002        return true;
2003    }
2004    // Character class containing only whitespace atoms.
2005    if let Some(inner) = trimmed.strip_prefix('[').and_then(|s| s.strip_suffix(']')) {
2006        let mut chars = inner.chars();
2007        let mut saw_atom = false;
2008        while let Some(c) = chars.next() {
2009            match c {
2010                '\\' => match chars.next() {
2011                    Some('s' | 't' | 'r' | 'n') => saw_atom = true,
2012                    _ => return false,
2013                },
2014                ' ' | '\t' => saw_atom = true,
2015                _ => return false,
2016            }
2017        }
2018        return saw_atom;
2019    }
2020    false
2021}
2022
2023fn placeholder_for_pattern(pattern: &str) -> String {
2024    // Heuristic placeholder for unconstrained PATTERN terminals.
2025    //
2026    // First handle the "the regex IS a literal escape" cases that
2027    // tree-sitter grammars use as separators (`\n`, `\r\n`, `;`,
2028    // etc.); emitting the matching character is always preferable
2029    // to a `_x` identifier-like placeholder when the surrounding
2030    // grammar expects a separator.
2031    let simple_lit = decode_simple_pattern_literal(pattern);
2032    if let Some(lit) = simple_lit {
2033        return lit;
2034    }
2035
2036    if pattern.contains("[0-9]") || pattern.contains("\\d") {
2037        "0".into()
2038    } else if pattern.contains("[a-zA-Z_]") || pattern.contains("\\w") {
2039        "_x".into()
2040    } else if pattern.contains('"') || pattern.contains('\'') {
2041        "\"\"".into()
2042    } else {
2043        "_".into()
2044    }
2045}
2046
2047/// Decode a tree-sitter PATTERN whose regex is a simple literal
2048/// (newline, semicolon, comma, etc.) to the byte sequence it matches.
2049/// Returns `None` for patterns with character classes, alternations,
2050/// or quantifiers; the caller falls back to the heuristic placeholder.
2051fn decode_simple_pattern_literal(pattern: &str) -> Option<String> {
2052    // Skip patterns containing regex metachars that would broaden the
2053    // match beyond a single literal byte sequence.
2054    if pattern
2055        .chars()
2056        .any(|c| matches!(c, '[' | ']' | '(' | ')' | '*' | '+' | '?' | '|' | '{' | '}'))
2057    {
2058        return None;
2059    }
2060    let mut out = String::new();
2061    let mut chars = pattern.chars();
2062    while let Some(c) = chars.next() {
2063        if c == '\\' {
2064            match chars.next() {
2065                Some('n') => out.push('\n'),
2066                Some('r') => out.push('\r'),
2067                Some('t') => out.push('\t'),
2068                Some('\\') => out.push('\\'),
2069                Some('/') => out.push('/'),
2070                Some(other) => out.push(other),
2071                None => return None,
2072            }
2073        } else {
2074            out.push(c);
2075        }
2076    }
2077    Some(out)
2078}
2079
2080// ═══════════════════════════════════════════════════════════════════
2081// Token list output with Spacing algebra
2082// ═══════════════════════════════════════════════════════════════════
2083//
2084// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
2085// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
2086// by `FormatPolicy`. Separating the structural output from the layout
2087// decision means each phase has one job: emit walks the grammar and
2088// pushes tokens; layout is a single fold, locally driven by adjacent
2089// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
2090
2091#[derive(Clone)]
2092enum Token {
2093    /// A user-visible terminal contributed by the grammar.
2094    Lit(String),
2095    /// `indent_open` marker emitted when a `Lit` matched the policy's
2096    /// open list. Carried as a separate token so layout can decide to
2097    /// break + indent without re-scanning.
2098    IndentOpen,
2099    /// `indent_close` marker emitted before a closer-`Lit`.
2100    IndentClose,
2101    /// "Break a line here if not already at line start" — used after
2102    /// statements/declarations and after open braces.
2103    LineBreak,
2104    /// Suppress the next inter-Lit separator. Pushed by the REPEAT
2105    /// walker when an iteration's "separator slot" (a CHOICE-with-BLANK
2106    /// or OPTIONAL at SEQ position 0) emitted zero content tokens, so
2107    /// the categorical reading is "no source-level separator existed
2108    /// between these two sibling iterations of the body".
2109    NoSpace,
2110}
2111
2112struct Output<'a> {
2113    tokens: Vec<Token>,
2114    policy: &'a FormatPolicy,
2115}
2116
2117#[derive(Clone)]
2118struct OutputSnapshot {
2119    tokens_len: usize,
2120}
2121
2122impl<'a> Output<'a> {
2123    fn new(policy: &'a FormatPolicy) -> Self {
2124        Self {
2125            tokens: Vec::new(),
2126            policy,
2127        }
2128    }
2129
2130    fn token(&mut self, value: &str) {
2131        if value.is_empty() {
2132            return;
2133        }
2134
2135        // A grammar STRING whose value is a newline (e.g. abc's `_NL = "\n"`
2136        // or any rule that uses `"\n"` as a structural line terminator)
2137        // must route through the layout's `LineBreak` channel. Emitting it
2138        // as a `Lit` leaves the newline character in the byte stream but
2139        // also makes `needs_space_between` insert the configured separator
2140        // between the newline and the following token, producing leading
2141        // spaces on every line after the first.
2142        if value == "\n" || value == "\r\n" || value == "\r" {
2143            self.tokens.push(Token::LineBreak);
2144            return;
2145        }
2146
2147        // A captured literal value (typically a vertex's `literal-value`
2148        // constraint covering the full source span of a terminal-like
2149        // rule, e.g. abc's `reference_number_line` matching `"X:1\n"`)
2150        // may contain trailing newlines. Splitting the trailing newlines
2151        // off as a `LineBreak` lets the layout pass treat the next Lit
2152        // as starting a new line; otherwise the next Lit pair would
2153        // trigger `needs_space_between` against the embedded `\n` and
2154        // insert the policy separator at column 0 of the new line.
2155        let trimmed = value.trim_end_matches(['\n', '\r']);
2156        let trailing_newlines = value.len() - trimmed.len();
2157        if trailing_newlines > 0 && !trimmed.is_empty() {
2158            if self.policy.indent_close.iter().any(|t| t == trimmed) {
2159                self.tokens.push(Token::IndentClose);
2160            }
2161            self.tokens.push(Token::Lit(trimmed.to_owned()));
2162            if self.policy.indent_open.iter().any(|t| t == trimmed) {
2163                self.tokens.push(Token::IndentOpen);
2164            } else if self.policy.line_break_after.iter().any(|t| t == trimmed) {
2165                // already emitting a LineBreak below for the trailing \n
2166            }
2167            self.tokens.push(Token::LineBreak);
2168            return;
2169        }
2170
2171        if self.policy.indent_close.iter().any(|t| t == value) {
2172            self.tokens.push(Token::IndentClose);
2173        }
2174
2175        self.tokens.push(Token::Lit(value.to_owned()));
2176
2177        if self.policy.indent_open.iter().any(|t| t == value) {
2178            self.tokens.push(Token::IndentOpen);
2179            self.tokens.push(Token::LineBreak);
2180        } else if self.policy.line_break_after.iter().any(|t| t == value) {
2181            self.tokens.push(Token::LineBreak);
2182        }
2183    }
2184
2185    fn newline(&mut self) {
2186        self.tokens.push(Token::LineBreak);
2187    }
2188
2189    /// Open an indent scope: subsequent `LineBreak`s render at the
2190    /// new depth until a matching `indent_close` pops it. Used by the
2191    /// external-token fallback to render indent-based grammars'
2192    /// `_indent` scanner outputs.
2193    fn indent_open(&mut self) {
2194        self.tokens.push(Token::IndentOpen);
2195        self.tokens.push(Token::LineBreak);
2196    }
2197
2198    /// Close one indent scope opened by `indent_open`.
2199    fn indent_close(&mut self) {
2200        self.tokens.push(Token::IndentClose);
2201    }
2202
2203    fn snapshot(&self) -> OutputSnapshot {
2204        OutputSnapshot {
2205            tokens_len: self.tokens.len(),
2206        }
2207    }
2208
2209    fn restore(&mut self, snap: OutputSnapshot) {
2210        self.tokens.truncate(snap.tokens_len);
2211    }
2212
2213    /// True iff at least one `Token::Lit` was pushed since `snap`.
2214    /// Control-only emissions (`LineBreak`, `IndentOpen` / `IndentClose`,
2215    /// `NoSpace`) do not count as content. Used by the REPEAT walker
2216    /// to detect that a "separator slot" CHOICE picked its BLANK
2217    /// alternative, so the next iteration's content can be marked
2218    /// tight against the previous iteration's content.
2219    fn lit_emitted_since(&self, snap: OutputSnapshot) -> bool {
2220        self.tokens[snap.tokens_len..]
2221            .iter()
2222            .any(|t| matches!(t, Token::Lit(_)))
2223    }
2224
2225    /// Push a marker that suppresses the next inter-Lit separator the
2226    /// layout pass would otherwise insert. Used to encode "no source-
2227    /// level separator was emitted between these two Lits" without
2228    /// having to make per-grammar adjacency decisions in the layout.
2229    fn no_space(&mut self) {
2230        self.tokens.push(Token::NoSpace);
2231    }
2232
2233    fn finish(self) -> Vec<u8> {
2234        layout(&self.tokens, self.policy)
2235    }
2236}
2237
2238/// Fold a token list into bytes. The algebra:
2239/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
2240/// * `IndentOpen` / `IndentClose` adjust a depth counter,
2241/// * `LineBreak` writes `\n` if not already at line start, then the
2242///   next `Lit` writes `indent * indent_width` spaces of indent.
2243fn layout(tokens: &[Token], policy: &FormatPolicy) -> Vec<u8> {
2244    let mut bytes = Vec::new();
2245    let mut indent: usize = 0;
2246    let mut at_line_start = true;
2247    let mut last_lit: Option<&str> = None;
2248    // True iff, at the moment `last_lit` was emitted, the cursor was at a
2249    // position where the grammar expects an operand: start of stream / line,
2250    // just after an open paren / bracket / brace, just after a separator like
2251    // `,` or `;`, or just after a binary / assignment operator. Used by
2252    // `needs_space_between` to recognise `last_lit` as a tight unary prefix
2253    // (`f(-1.0)`) rather than a spaced binary operator (`a - b`).
2254    let mut last_was_in_operand_position = true;
2255    let mut expecting_operand = true;
2256    // Set when a `Token::NoSpace` marker is seen; cleared when the next
2257    // Lit consumes it. While set, suppress the policy separator that
2258    // would otherwise be inserted before the next Lit.
2259    let mut suppress_next_separator = false;
2260    let newline = policy.newline.as_bytes();
2261    let separator = policy.separator.as_bytes();
2262
2263    for tok in tokens {
2264        match tok {
2265            Token::IndentOpen => indent += 1,
2266            Token::IndentClose => {
2267                indent = indent.saturating_sub(1);
2268                if !at_line_start {
2269                    bytes.extend_from_slice(newline);
2270                    at_line_start = true;
2271                    expecting_operand = true;
2272                }
2273            }
2274            Token::LineBreak => {
2275                if !at_line_start {
2276                    bytes.extend_from_slice(newline);
2277                    at_line_start = true;
2278                    expecting_operand = true;
2279                }
2280            }
2281            Token::NoSpace => {
2282                suppress_next_separator = true;
2283            }
2284            Token::Lit(value) => {
2285                if at_line_start {
2286                    bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
2287                } else if let Some(prev) = last_lit {
2288                    if !suppress_next_separator
2289                        && needs_space_between(prev, value, last_was_in_operand_position)
2290                    {
2291                        bytes.extend_from_slice(separator);
2292                    }
2293                }
2294                suppress_next_separator = false;
2295                bytes.extend_from_slice(value.as_bytes());
2296                at_line_start = false;
2297                last_was_in_operand_position = expecting_operand;
2298                expecting_operand = leaves_operand_position(value);
2299                last_lit = Some(value.as_str());
2300            }
2301        }
2302    }
2303
2304    if !at_line_start {
2305        bytes.extend_from_slice(newline);
2306    }
2307    bytes
2308}
2309
2310/// True iff emitting `tok` leaves the cursor in a position where the
2311/// grammar expects an operand next. Operand-introducing tokens are open
2312/// punctuation, separators, and operator-like strings; operand-terminating
2313/// tokens are identifiers, literals, and closing punctuation.
2314fn leaves_operand_position(tok: &str) -> bool {
2315    if tok.is_empty() {
2316        return true;
2317    }
2318    if is_punct_open(tok) {
2319        return true;
2320    }
2321    if matches!(tok, "," | ";") {
2322        return true;
2323    }
2324    if is_punct_close(tok) {
2325        return false;
2326    }
2327    if first_is_alnum_or_underscore(tok) || last_ends_with_alnum(tok) {
2328        return false;
2329    }
2330    // Pure punctuation/operator runs (`=`, `+`, `-`, `<=`, `>>`, …) leave
2331    // the cursor expecting another operand.
2332    true
2333}
2334
2335fn needs_space_between(last: &str, next: &str, expecting_operand: bool) -> bool {
2336    if last.is_empty() || next.is_empty() {
2337        return false;
2338    }
2339    if is_punct_open(last) || is_punct_open(next) {
2340        return false;
2341    }
2342    if is_punct_close(next) {
2343        return false;
2344    }
2345    if is_punct_close(last) && is_punct_punctuation(next) {
2346        return false;
2347    }
2348    if last == "." || next == "." {
2349        return false;
2350    }
2351    // Tight unary prefix: `last` is a sign/logical-not operator emitted
2352    // where the grammar expected an operand, so it glues to `next`.
2353    // `expecting_operand` here means: just before `last` was emitted,
2354    // the cursor expected an operand, which makes `last` a unary prefix.
2355    // Examples: `f(-1.0)`, `[ -2, 3 ]`, `return -x`, `a = !flag`.
2356    if expecting_operand && is_unary_prefix_operator(last) && first_is_operand_start(next) {
2357        return false;
2358    }
2359    if last_is_word_like(last) && first_is_word_like(next) {
2360        return true;
2361    }
2362    if last_ends_with_alnum(last) && first_is_alnum_or_underscore(next) {
2363        return true;
2364    }
2365    // Adjacent operator runs: keep them apart so the lexer doesn't glue
2366    // `>` and `=` into `>=` unintentionally.
2367    true
2368}
2369
2370fn is_unary_prefix_operator(s: &str) -> bool {
2371    matches!(s, "-" | "+" | "!" | "~")
2372}
2373
2374fn first_is_operand_start(s: &str) -> bool {
2375    s.chars()
2376        .next()
2377        .map(|c| c.is_alphanumeric() || c == '_' || c == '.' || c == '(')
2378        .unwrap_or(false)
2379}
2380
2381fn is_punct_open(s: &str) -> bool {
2382    matches!(s, "(" | "[" | "{" | "\"" | "'" | "`" | "@" | "#")
2383}
2384
2385fn is_punct_close(s: &str) -> bool {
2386    matches!(s, ")" | "]" | "}" | "," | ";" | ":" | "\"" | "'" | "`")
2387}
2388
2389fn is_punct_punctuation(s: &str) -> bool {
2390    matches!(s, "," | ";" | ":" | "." | ")" | "]" | "}")
2391}
2392
2393fn last_is_word_like(s: &str) -> bool {
2394    s.chars()
2395        .next_back()
2396        .map(|c| c.is_alphanumeric() || c == '_')
2397        .unwrap_or(false)
2398}
2399
2400fn first_is_word_like(s: &str) -> bool {
2401    s.chars()
2402        .next()
2403        .map(|c| c.is_alphanumeric() || c == '_')
2404        .unwrap_or(false)
2405}
2406
2407fn last_ends_with_alnum(s: &str) -> bool {
2408    s.chars()
2409        .next_back()
2410        .map(char::is_alphanumeric)
2411        .unwrap_or(false)
2412}
2413
2414fn first_is_alnum_or_underscore(s: &str) -> bool {
2415    s.chars()
2416        .next()
2417        .map(|c| c.is_alphanumeric() || c == '_')
2418        .unwrap_or(false)
2419}
2420
2421#[cfg(test)]
2422mod tests {
2423    use super::*;
2424
2425    #[test]
2426    fn parses_simple_grammar_json() {
2427        let bytes = br#"{
2428            "name": "tiny",
2429            "rules": {
2430                "program": {
2431                    "type": "SEQ",
2432                    "members": [
2433                        {"type": "STRING", "value": "hello"},
2434                        {"type": "STRING", "value": ";"}
2435                    ]
2436                }
2437            }
2438        }"#;
2439        let g = Grammar::from_bytes("tiny", bytes).expect("valid tiny grammar");
2440        assert!(g.rules.contains_key("program"));
2441    }
2442
2443    #[test]
2444    fn output_emits_punctuation_without_leading_space() {
2445        let policy = FormatPolicy::default();
2446        let mut out = Output::new(&policy);
2447        out.token("foo");
2448        out.token("(");
2449        out.token(")");
2450        out.token(";");
2451        let bytes = out.finish();
2452        let s = std::str::from_utf8(&bytes).expect("ascii output");
2453        assert!(s.starts_with("foo();"), "got {s:?}");
2454    }
2455
2456    #[test]
2457    fn grammar_from_bytes_rejects_malformed_input() {
2458        let result = Grammar::from_bytes("malformed", b"not json");
2459        let err = result.expect_err("malformed bytes must yield Err");
2460        let msg = err.to_string();
2461        assert!(
2462            msg.contains("malformed"),
2463            "error message should name the protocol: {msg:?}"
2464        );
2465    }
2466
2467    #[test]
2468    fn output_indents_after_open_brace() {
2469        let policy = FormatPolicy::default();
2470        let mut out = Output::new(&policy);
2471        out.token("fn");
2472        out.token("foo");
2473        out.token("(");
2474        out.token(")");
2475        out.token("{");
2476        out.token("body");
2477        out.token("}");
2478        let bytes = out.finish();
2479        let s = std::str::from_utf8(&bytes).expect("ascii output");
2480        assert!(s.contains("{\n"), "newline after opening brace: {s:?}");
2481        assert!(s.contains("body"), "body inside block: {s:?}");
2482        assert!(s.ends_with("}\n"), "newline after closing brace: {s:?}");
2483    }
2484
2485    #[test]
2486    fn output_no_space_between_word_and_dot() {
2487        let policy = FormatPolicy::default();
2488        let mut out = Output::new(&policy);
2489        out.token("foo");
2490        out.token(".");
2491        out.token("bar");
2492        let bytes = out.finish();
2493        let s = std::str::from_utf8(&bytes).expect("ascii output");
2494        assert!(s.starts_with("foo.bar"), "no space around dot: {s:?}");
2495    }
2496
2497    #[test]
2498    fn output_snapshot_restore_truncates_bytes() {
2499        let policy = FormatPolicy::default();
2500        let mut out = Output::new(&policy);
2501        out.token("keep");
2502        let snap = out.snapshot();
2503        out.token("drop");
2504        out.token("more");
2505        out.restore(snap);
2506        out.token("after");
2507        let bytes = out.finish();
2508        let s = std::str::from_utf8(&bytes).expect("ascii output");
2509        assert!(s.contains("keep"), "kept token survives: {s:?}");
2510        assert!(s.contains("after"), "post-restore token visible: {s:?}");
2511        assert!(!s.contains("drop"), "rolled-back token removed: {s:?}");
2512        assert!(!s.contains("more"), "rolled-back token removed: {s:?}");
2513    }
2514
2515    #[test]
2516    fn child_cursor_take_field_consumes_once() {
2517        let edges_owned: Vec<Edge> = vec![Edge {
2518            src: panproto_gat::Name::from("p"),
2519            tgt: panproto_gat::Name::from("c"),
2520            kind: panproto_gat::Name::from("name"),
2521            name: None,
2522        }];
2523        let edges: Vec<&Edge> = edges_owned.iter().collect();
2524        let mut cursor = ChildCursor::new(&edges);
2525        let first = cursor.take_field("name");
2526        let second = cursor.take_field("name");
2527        assert!(first.is_some(), "first take returns the edge");
2528        assert!(
2529            second.is_none(),
2530            "second take returns None (already consumed)"
2531        );
2532    }
2533
2534    #[test]
2535    fn child_cursor_take_matching_predicate() {
2536        let edges_owned: Vec<Edge> = vec![
2537            Edge {
2538                src: "p".into(),
2539                tgt: "c1".into(),
2540                kind: "child_of".into(),
2541                name: None,
2542            },
2543            Edge {
2544                src: "p".into(),
2545                tgt: "c2".into(),
2546                kind: "key".into(),
2547                name: None,
2548            },
2549        ];
2550        let edges: Vec<&Edge> = edges_owned.iter().collect();
2551        let mut cursor = ChildCursor::new(&edges);
2552        assert!(cursor.has_matching(|e| e.kind.as_ref() == "key"));
2553        let taken = cursor.take_matching(|e| e.kind.as_ref() == "key");
2554        assert!(taken.is_some());
2555        assert!(
2556            !cursor.has_matching(|e| e.kind.as_ref() == "key"),
2557            "consumed edge no longer matches"
2558        );
2559        assert!(
2560            cursor.has_matching(|e| e.kind.as_ref() == "child_of"),
2561            "the other edge is still available"
2562        );
2563    }
2564
2565    #[test]
2566    fn kind_satisfies_symbol_direct_match() {
2567        let bytes = br#"{
2568            "name": "tiny",
2569            "rules": {
2570                "x": {"type": "STRING", "value": "x"}
2571            }
2572        }"#;
2573        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
2574        assert!(kind_satisfies_symbol(&g, Some("x"), "x"));
2575        assert!(!kind_satisfies_symbol(&g, Some("y"), "x"));
2576        assert!(!kind_satisfies_symbol(&g, None, "x"));
2577    }
2578
2579    #[test]
2580    fn kind_satisfies_symbol_through_hidden_rule() {
2581        let bytes = br#"{
2582            "name": "tiny",
2583            "rules": {
2584                "_value": {
2585                    "type": "CHOICE",
2586                    "members": [
2587                        {"type": "SYMBOL", "name": "object"},
2588                        {"type": "SYMBOL", "name": "number"}
2589                    ]
2590                },
2591                "object": {"type": "STRING", "value": "{}"},
2592                "number": {"type": "PATTERN", "value": "[0-9]+"}
2593            }
2594        }"#;
2595        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
2596        assert!(
2597            kind_satisfies_symbol(&g, Some("number"), "_value"),
2598            "number is reachable from _value via CHOICE"
2599        );
2600        assert!(
2601            kind_satisfies_symbol(&g, Some("object"), "_value"),
2602            "object is reachable from _value via CHOICE"
2603        );
2604        assert!(
2605            !kind_satisfies_symbol(&g, Some("string"), "_value"),
2606            "string is NOT among the alternatives"
2607        );
2608    }
2609
2610    #[test]
2611    fn first_symbol_skips_string_terminals() {
2612        let prod: Production = serde_json::from_str(
2613            r#"{
2614                "type": "SEQ",
2615                "members": [
2616                    {"type": "STRING", "value": "{"},
2617                    {"type": "SYMBOL", "name": "body"},
2618                    {"type": "STRING", "value": "}"}
2619                ]
2620            }"#,
2621        )
2622        .expect("valid SEQ");
2623        assert_eq!(first_symbol(&prod), Some("body"));
2624    }
2625
2626    #[test]
2627    fn placeholder_for_pattern_routes_by_regex_class() {
2628        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
2629        assert_eq!(placeholder_for_pattern("[a-zA-Z_]\\w*"), "_x");
2630        assert_eq!(placeholder_for_pattern("\"[^\"]*\""), "\"\"");
2631        assert_eq!(placeholder_for_pattern("\\d+\\.\\d+"), "0");
2632    }
2633
2634    #[test]
2635    fn format_policy_default_breaks_after_semicolon() {
2636        let policy = FormatPolicy::default();
2637        assert!(policy.line_break_after.iter().any(|t| t == ";"));
2638        assert!(policy.indent_open.iter().any(|t| t == "{"));
2639        assert!(policy.indent_close.iter().any(|t| t == "}"));
2640        assert_eq!(policy.indent_width, 2);
2641    }
2642
2643    #[test]
2644    fn placeholder_decodes_literal_pattern_separators() {
2645        // PATTERN regexes that match a single literal byte sequence
2646        // (newline, semicolon, comma) emit the bytes verbatim instead
2647        // of falling through to the `_` catch-all.
2648        assert_eq!(placeholder_for_pattern("\\n"), "\n");
2649        assert_eq!(placeholder_for_pattern("\\r\\n"), "\r\n");
2650        assert_eq!(placeholder_for_pattern(";"), ";");
2651        // Patterns with character classes / alternation still route
2652        // through the heuristic.
2653        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
2654        assert_eq!(placeholder_for_pattern("a|b"), "_");
2655    }
2656
2657    #[test]
2658    fn supertypes_decode_from_grammar_json_strings() {
2659        // Tree-sitter older grammars list supertypes as bare strings.
2660        let bytes = br#"{
2661            "name": "tiny",
2662            "supertypes": ["expression"],
2663            "rules": {
2664                "expression": {
2665                    "type": "CHOICE",
2666                    "members": [
2667                        {"type": "SYMBOL", "name": "binary_expression"},
2668                        {"type": "SYMBOL", "name": "identifier"}
2669                    ]
2670                },
2671                "binary_expression": {"type": "STRING", "value": "x"},
2672                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2673            }
2674        }"#;
2675        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2676        assert!(g.supertypes.contains("expression"));
2677        // identifier matches the supertype `expression`.
2678        assert!(kind_satisfies_symbol(&g, Some("identifier"), "expression"));
2679        // unrelated kinds do not.
2680        assert!(!kind_satisfies_symbol(&g, Some("string"), "expression"));
2681    }
2682
2683    #[test]
2684    fn supertypes_decode_from_grammar_json_objects() {
2685        // Recent grammars list supertypes as `{type: SYMBOL, name: ...}`
2686        // entries instead of bare strings.
2687        let bytes = br#"{
2688            "name": "tiny",
2689            "supertypes": [{"type": "SYMBOL", "name": "stmt"}],
2690            "rules": {
2691                "stmt": {
2692                    "type": "CHOICE",
2693                    "members": [
2694                        {"type": "SYMBOL", "name": "while_stmt"},
2695                        {"type": "SYMBOL", "name": "if_stmt"}
2696                    ]
2697                },
2698                "while_stmt": {"type": "STRING", "value": "while"},
2699                "if_stmt": {"type": "STRING", "value": "if"}
2700            }
2701        }"#;
2702        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2703        assert!(g.supertypes.contains("stmt"));
2704        assert!(kind_satisfies_symbol(&g, Some("while_stmt"), "stmt"));
2705    }
2706
2707    #[test]
2708    fn alias_value_matches_kind() {
2709        // A named ALIAS rewrites the parser-visible kind to `value`;
2710        // `kind_satisfies_symbol` should accept that rewritten kind
2711        // when looking up the original SYMBOL.
2712        let bytes = br#"{
2713            "name": "tiny",
2714            "rules": {
2715                "_package_identifier": {
2716                    "type": "ALIAS",
2717                    "named": true,
2718                    "value": "package_identifier",
2719                    "content": {"type": "SYMBOL", "name": "identifier"}
2720                },
2721                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2722            }
2723        }"#;
2724        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2725        assert!(kind_satisfies_symbol(
2726            &g,
2727            Some("package_identifier"),
2728            "_package_identifier"
2729        ));
2730    }
2731
2732    #[test]
2733    fn referenced_symbols_walks_nested_seq() {
2734        let prod: Production = serde_json::from_str(
2735            r#"{
2736                "type": "SEQ",
2737                "members": [
2738                    {"type": "CHOICE", "members": [
2739                        {"type": "SYMBOL", "name": "attribute_item"},
2740                        {"type": "BLANK"}
2741                    ]},
2742                    {"type": "SYMBOL", "name": "parameter"},
2743                    {"type": "REPEAT", "content": {
2744                        "type": "SEQ",
2745                        "members": [
2746                            {"type": "STRING", "value": ","},
2747                            {"type": "SYMBOL", "name": "parameter"}
2748                        ]
2749                    }}
2750                ]
2751            }"#,
2752        )
2753        .expect("seq");
2754        let symbols = referenced_symbols(&prod);
2755        assert!(symbols.contains(&"attribute_item"));
2756        assert!(symbols.contains(&"parameter"));
2757    }
2758
2759    #[test]
2760    fn literal_strings_collects_choice_members() {
2761        let prod: Production = serde_json::from_str(
2762            r#"{
2763                "type": "CHOICE",
2764                "members": [
2765                    {"type": "STRING", "value": "+"},
2766                    {"type": "STRING", "value": "-"},
2767                    {"type": "STRING", "value": "*"}
2768                ]
2769            }"#,
2770        )
2771        .expect("choice");
2772        let strings = literal_strings(&prod);
2773        assert_eq!(strings, vec!["+", "-", "*"]);
2774    }
2775
2776    /// The ocaml and javascript grammars (tree-sitter ≥ 0.25) emit a
2777    /// `RESERVED` rule kind that an earlier deserialiser rejected
2778    /// with `unknown variant "RESERVED"`. Verify both that the bare
2779    /// variant deserialises and that a `RESERVED`-wrapped grammar is
2780    /// loadable end-to-end via [`Grammar::from_bytes`].
2781    #[test]
2782    fn reserved_variant_deserialises() {
2783        let prod: Production = serde_json::from_str(
2784            r#"{
2785                "type": "RESERVED",
2786                "content": {"type": "SYMBOL", "name": "_lowercase_identifier"},
2787                "context_name": "attribute_id"
2788            }"#,
2789        )
2790        .expect("RESERVED parses");
2791        match prod {
2792            Production::Reserved { content, .. } => match *content {
2793                Production::Symbol { name } => assert_eq!(name, "_lowercase_identifier"),
2794                other => panic!("expected inner SYMBOL, got {other:?}"),
2795            },
2796            other => panic!("expected RESERVED, got {other:?}"),
2797        }
2798    }
2799
2800    #[test]
2801    fn reserved_grammar_loads_end_to_end() {
2802        let bytes = br#"{
2803            "name": "tiny_reserved",
2804            "rules": {
2805                "program": {
2806                    "type": "RESERVED",
2807                    "content": {"type": "SYMBOL", "name": "ident"},
2808                    "context_name": "keywords"
2809                },
2810                "ident": {"type": "PATTERN", "value": "[a-z]+"}
2811            }
2812        }"#;
2813        let g = Grammar::from_bytes("tiny_reserved", bytes).expect("RESERVED-using grammar loads");
2814        assert!(g.rules.contains_key("program"));
2815    }
2816
2817    #[test]
2818    fn reserved_walker_helpers_recurse_into_content() {
2819        // The walker's helpers (first_symbol, has_field_in,
2820        // referenced_symbols, ...) all need to descend through
2821        // RESERVED into its content. If they bail at RESERVED, the
2822        // `pick_choice_with_cursor` heuristic ranks the alt below
2823        // alts that DO recurse, which produces wrong emit output
2824        // even when the deserialiser doesn't crash.
2825        let prod: Production = serde_json::from_str(
2826            r#"{
2827                "type": "RESERVED",
2828                "content": {
2829                    "type": "FIELD",
2830                    "name": "lhs",
2831                    "content": {"type": "SYMBOL", "name": "expr"}
2832                },
2833                "context_name": "ctx"
2834            }"#,
2835        )
2836        .expect("nested RESERVED parses");
2837        assert_eq!(first_symbol(&prod), Some("expr"));
2838        assert!(has_field_in(&prod, &["lhs"]));
2839        let symbols = referenced_symbols(&prod);
2840        assert!(symbols.contains(&"expr"));
2841    }
2842}