Skip to main content

panproto_parse/
emit_pretty.rs

1#![allow(
2    clippy::module_name_repetitions,
3    clippy::too_many_lines,
4    clippy::too_many_arguments,
5    clippy::map_unwrap_or,
6    clippy::option_if_let_else,
7    clippy::elidable_lifetime_names,
8    clippy::items_after_statements,
9    clippy::needless_pass_by_value,
10    clippy::single_match_else,
11    clippy::manual_let_else,
12    clippy::match_same_arms,
13    clippy::missing_const_for_fn,
14    clippy::single_char_pattern,
15    clippy::naive_bytecount,
16    clippy::expect_used,
17    clippy::redundant_pub_crate,
18    clippy::used_underscore_binding,
19    clippy::redundant_field_names,
20    clippy::struct_field_names,
21    clippy::redundant_else,
22    clippy::similar_names
23)]
24
25//! De-novo source emission from a by-construction schema.
26//!
27//! [`AstParser::emit`] reconstructs source from byte-position fragments
28//! that the parser stored on the schema during `parse`. That works for
29//! edit pipelines (`parse → transform → emit`) but fails for schemas
30//! built by hand (`SchemaBuilder` with no parse history): they carry
31//! no `start-byte`, no `interstitial-N`, no `literal-value`, and the
32//! reconstructor returns `Err(EmitFailed { reason: "schema has no
33//! text fragments" })`.
34//!
35//! This module renders such schemas to source bytes by walking
36//! tree-sitter's `grammar.json` production rules. For each schema
37//! vertex of kind `K`, the walker looks up `K`'s production in the
38//! grammar and emits its body in order:
39//!
40//! - `STRING` nodes contribute literal token bytes directly.
41//! - `SYMBOL` and `FIELD` nodes recurse into the schema's children,
42//!   matching by edge kind (which is the tree-sitter field name).
43//! - `SEQ` emits its members in order.
44//! - `CHOICE` picks the alternative whose head `SYMBOL` matches an
45//!   actual child kind, or whose terminals appear in the rendered
46//!   prefix; falls back to the first non-`BLANK` alternative when no
47//!   alternative matches.
48//! - `REPEAT` and `REPEAT1` emit their content once per matching
49//!   child edge in declared order.
50//! - `OPTIONAL` emits its content iff a corresponding child edge or
51//!   constraint is populated.
52//! - `PATTERN` is a regex placeholder for variable-text terminals
53//!   (identifiers, numbers, quoted strings). The walker emits a
54//!   `literal-value` constraint when present and otherwise falls
55//!   back to a placeholder derived from the regex shape.
56//! - `BLANK`, `TOKEN`, `IMMEDIATE_TOKEN`, `ALIAS`, `PREC*` are
57//!   handled transparently (the inner content is emitted; the
58//!   wrapper is dropped).
59//!
60//! Whitespace and indentation come from a `FormatPolicy` applied
61//! during emission. The default policy inserts a single space between
62//! adjacent tokens, a newline after `;` / `}` / `{`, and tracks an
63//! indent counter on `{` / `}` boundaries.
64//!
65//! Output is *syntactically valid* for any grammar that ships
66//! `grammar.json`. Idiomatic formatting (rustfmt-style spacing rules,
67//! per-language conventions) is a polish layer that lives outside
68//! this module.
69
70use std::collections::BTreeMap;
71
72use panproto_schema::{Edge, Schema};
73use serde::Deserialize;
74
75use crate::error::ParseError;
76
77// ═══════════════════════════════════════════════════════════════════
78// Grammar JSON model
79// ═══════════════════════════════════════════════════════════════════
80
81/// A single tree-sitter production rule.
82///
83/// Mirrors the shape emitted by `tree-sitter generate`: every node has
84/// a `type` discriminator that selects a structural variant. The
85/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
86/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
87/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
88/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
89#[derive(Debug, Clone, Deserialize)]
90#[serde(tag = "type")]
91#[non_exhaustive]
92pub enum Production {
93    /// Concatenation of productions.
94    #[serde(rename = "SEQ")]
95    Seq {
96        /// Ordered members; each is emitted in turn.
97        members: Vec<Self>,
98    },
99    /// Alternation between productions.
100    #[serde(rename = "CHOICE")]
101    Choice {
102        /// Alternatives; the walker picks one based on the schema's
103        /// children and constraints.
104        members: Vec<Self>,
105    },
106    /// Zero-or-more repetition.
107    #[serde(rename = "REPEAT")]
108    Repeat {
109        /// The repeated body.
110        content: Box<Self>,
111    },
112    /// One-or-more repetition.
113    #[serde(rename = "REPEAT1")]
114    Repeat1 {
115        /// The repeated body.
116        content: Box<Self>,
117    },
118    /// Optional inclusion (zero or one).
119    ///
120    /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
121    /// BLANK }`, but recent generator versions also emit explicit
122    /// `OPTIONAL` nodes; both shapes are accepted.
123    #[serde(rename = "OPTIONAL")]
124    Optional {
125        /// The optional body.
126        content: Box<Self>,
127    },
128    /// Reference to another rule by name.
129    #[serde(rename = "SYMBOL")]
130    Symbol {
131        /// Name of the referenced rule (matches a vertex kind on the
132        /// schema side).
133        name: String,
134    },
135    /// Literal token bytes.
136    #[serde(rename = "STRING")]
137    String {
138        /// The literal token. Emitted verbatim.
139        value: String,
140    },
141    /// Regex-matched terminal.
142    ///
143    /// At parse time this matches arbitrary bytes; at emit time the
144    /// walker substitutes a `literal-value` constraint when present
145    /// and falls back to a placeholder otherwise.
146    #[serde(rename = "PATTERN")]
147    Pattern {
148        /// The original regex.
149        value: String,
150    },
151    /// The empty production. Emits nothing.
152    #[serde(rename = "BLANK")]
153    Blank,
154    /// Named field over a content production.
155    ///
156    /// The field `name` matches an edge kind on the schema side; the
157    /// walker resolves the corresponding child vertex and recurses
158    /// into `content` with that child as context.
159    #[serde(rename = "FIELD")]
160    Field {
161        /// Field name (matches edge kind).
162        name: String,
163        /// The contents of the field.
164        content: Box<Self>,
165    },
166    /// An aliased production.
167    ///
168    /// `value` records the parser-visible kind; the walker emits
169    /// `content` and ignores the alias rename.
170    #[serde(rename = "ALIAS")]
171    Alias {
172        /// The aliased content.
173        content: Box<Self>,
174        /// Whether the alias is a named node.
175        #[serde(default)]
176        named: bool,
177        /// The alias's surface name.
178        #[serde(default)]
179        value: String,
180    },
181    /// A token wrapper.
182    ///
183    /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
184    /// lexical token; the walker emits the inner content unchanged.
185    #[serde(rename = "TOKEN")]
186    Token {
187        /// The wrapped content.
188        content: Box<Self>,
189    },
190    /// An immediate-token wrapper (no preceding whitespace).
191    ///
192    /// Treated like [`Production::Token`] for emit purposes.
193    #[serde(rename = "IMMEDIATE_TOKEN")]
194    ImmediateToken {
195        /// The wrapped content.
196        content: Box<Self>,
197    },
198    /// Precedence wrapper.
199    #[serde(rename = "PREC")]
200    Prec {
201        /// Precedence value (numeric or string). Ignored at emit time.
202        #[allow(dead_code)]
203        value: serde_json::Value,
204        /// The wrapped content.
205        content: Box<Self>,
206    },
207    /// Left-associative precedence wrapper.
208    #[serde(rename = "PREC_LEFT")]
209    PrecLeft {
210        /// Precedence value. Ignored at emit time.
211        #[allow(dead_code)]
212        value: serde_json::Value,
213        /// The wrapped content.
214        content: Box<Self>,
215    },
216    /// Right-associative precedence wrapper.
217    #[serde(rename = "PREC_RIGHT")]
218    PrecRight {
219        /// Precedence value. Ignored at emit time.
220        #[allow(dead_code)]
221        value: serde_json::Value,
222        /// The wrapped content.
223        content: Box<Self>,
224    },
225    /// Dynamic precedence wrapper.
226    #[serde(rename = "PREC_DYNAMIC")]
227    PrecDynamic {
228        /// Precedence value. Ignored at emit time.
229        #[allow(dead_code)]
230        value: serde_json::Value,
231        /// The wrapped content.
232        content: Box<Self>,
233    },
234    /// Reserved-word wrapper (tree-sitter ≥ 0.25).
235    ///
236    /// Tree-sitter's `RESERVED` rule marks an inner production as a
237    /// reserved-word context: the parser excludes the listed identifiers
238    /// from being treated as the inner symbol. The `context_name`
239    /// metadata names the reserved-word set; the emitter does not need
240    /// it (we are walking schema → bytes, not enforcing reserved-word
241    /// constraints), so we emit the inner content unchanged, the same
242    /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
243    #[serde(rename = "RESERVED")]
244    Reserved {
245        /// The wrapped content.
246        content: Box<Self>,
247        /// Name of the reserved-word context. Ignored at emit time.
248        #[allow(dead_code)]
249        #[serde(default)]
250        context_name: String,
251    },
252}
253
254/// A grammar's production-rule table, deserialized from `grammar.json`.
255///
256/// Only the fields the emitter consumes are decoded; precedences,
257/// conflicts, externals, and other parser-only metadata are ignored.
258#[derive(Debug, Clone, Deserialize)]
259pub struct Grammar {
260    /// Grammar name (e.g. `"rust"`, `"typescript"`).
261    #[allow(dead_code)]
262    pub name: String,
263    /// Map from rule name (a vertex kind on the schema side) to
264    /// production. Entries are kept in lexical order so iteration
265    /// is deterministic.
266    pub rules: BTreeMap<String, Production>,
267    /// Supertypes declared in the grammar's `supertypes` field. A
268    /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
269    /// references; tree-sitter parsers report a node's kind as one
270    /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
271    /// than the supertype name (`parameter`), so the emitter needs to
272    /// know that a child kind in a subtype set should match the
273    /// supertype name when a SYMBOL references it.
274    #[serde(default, deserialize_with = "deserialize_supertypes")]
275    pub supertypes: std::collections::HashSet<String>,
276    /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
277    /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
278    /// reference on the schema side.
279    ///
280    /// Built once at [`Grammar::from_bytes`] time by walking each
281    /// hidden rule (`_`-prefixed), declared supertype, and named
282    /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
283    /// recording the closure. This replaces the prior heuristic
284    /// `kind_satisfies_symbol` that walked the rule body on every
285    /// query: lookups are now O(1) and the relation is exactly the
286    /// transitive closure of "is reachable via hidden / supertype /
287    /// alias dispatch", with no over-expansion through non-hidden
288    /// non-supertype rule references.
289    #[serde(skip)]
290    pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
291}
292
293fn deserialize_supertypes<'de, D>(
294    deserializer: D,
295) -> Result<std::collections::HashSet<String>, D::Error>
296where
297    D: serde::Deserializer<'de>,
298{
299    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
300    let mut out = std::collections::HashSet::new();
301    for entry in entries {
302        match entry {
303            serde_json::Value::String(s) => {
304                out.insert(s);
305            }
306            serde_json::Value::Object(map) => {
307                if let Some(serde_json::Value::String(name)) = map.get("name") {
308                    out.insert(name.clone());
309                }
310            }
311            _ => {}
312        }
313    }
314    Ok(out)
315}
316
317impl Grammar {
318    /// Parse a grammar's `grammar.json` bytes.
319    ///
320    /// Builds the subtyping closure as part of construction so every
321    /// downstream lookup is O(1). The closure is the least relation
322    /// containing `(K, K)` for every rule key `K` and closed under:
323    ///
324    /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
325    ///   reach SYMBOL `K`, then `K ⊑ S`.
326    /// - supertype expansion: if `S` is in the grammar's supertypes
327    ///   block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
328    /// - alias renaming: if a rule body contains
329    ///   `ALIAS { content: SYMBOL R, value: A, named: true }` where
330    ///   `R` reaches kind `K` (or `K = R` when no further hop), then
331    ///   `A ⊑ R` and `K ⊑ A`.
332    ///
333    /// # Errors
334    ///
335    /// Returns [`ParseError::EmitFailed`] when the bytes are not a
336    /// valid `grammar.json` document.
337    pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
338        let mut grammar: Self =
339            serde_json::from_slice(bytes).map_err(|e| ParseError::EmitFailed {
340                protocol: protocol.to_owned(),
341                reason: format!("grammar.json deserialization failed: {e}"),
342            })?;
343        grammar.subtypes = compute_subtype_closure(&grammar);
344        Ok(grammar)
345    }
346}
347
348/// Compute the subtyping relation as a forward-indexed map from a
349/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
350fn compute_subtype_closure(
351    grammar: &Grammar,
352) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
353    use std::collections::{HashMap, HashSet};
354    // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
355    // recorded whenever Y is reached by walking the grammar's
356    // ALIAS / hidden-rule / supertype dispatch from a position where
357    // K is the actual vertex kind.
358    let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
359    for name in grammar.rules.keys() {
360        subtypes
361            .entry(name.clone())
362            .or_default()
363            .insert(name.clone());
364    }
365
366    // First pass: collect the immediate "satisfies" edges from each
367    // expandable rule (hidden, supertype) to the kinds reachable by
368    // walking its body, plus alias edges.
369    fn walk<'g>(
370        grammar: &'g Grammar,
371        production: &'g Production,
372        visited: &mut HashSet<&'g str>,
373        out: &mut HashSet<String>,
374    ) {
375        match production {
376            Production::Symbol { name } => {
377                // Direct subtype.
378                out.insert(name.clone());
379                // Continue expansion through hidden / supertype rules
380                // so the closure traverses pass-through dispatch.
381                let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
382                if expand && visited.insert(name.as_str()) {
383                    if let Some(rule) = grammar.rules.get(name) {
384                        walk(grammar, rule, visited, out);
385                    }
386                }
387            }
388            Production::Choice { members } | Production::Seq { members } => {
389                for m in members {
390                    walk(grammar, m, visited, out);
391                }
392            }
393            Production::Alias {
394                content,
395                named,
396                value,
397            } => {
398                if *named && !value.is_empty() {
399                    out.insert(value.clone());
400                }
401                walk(grammar, content, visited, out);
402            }
403            Production::Repeat { content }
404            | Production::Repeat1 { content }
405            | Production::Optional { content }
406            | Production::Field { content, .. }
407            | Production::Token { content }
408            | Production::ImmediateToken { content }
409            | Production::Prec { content, .. }
410            | Production::PrecLeft { content, .. }
411            | Production::PrecRight { content, .. }
412            | Production::PrecDynamic { content, .. }
413            | Production::Reserved { content, .. } => {
414                walk(grammar, content, visited, out);
415            }
416            _ => {}
417        }
418    }
419
420    for (name, rule) in &grammar.rules {
421        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
422        if !expand {
423            continue;
424        }
425        let mut visited: HashSet<&str> = HashSet::new();
426        visited.insert(name.as_str());
427        let mut reachable: HashSet<String> = HashSet::new();
428        walk(grammar, rule, &mut visited, &mut reachable);
429        for kind in &reachable {
430            subtypes
431                .entry(kind.clone())
432                .or_default()
433                .insert(name.clone());
434        }
435    }
436
437    // Aliases: scan every rule body for ALIAS { content, value }
438    // declarations. The kinds reachable from `content` satisfy
439    // `value`, AND (by construction) `value` satisfies the
440    // surrounding rule. Walking the ENTIRE grammar once captures
441    // every alias site, irrespective of which rule introduces it.
442    fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
443        match production {
444            Production::Alias {
445                content,
446                named,
447                value,
448            } => {
449                if *named && !value.is_empty() {
450                    out.push((value.clone(), content.as_ref()));
451                }
452                collect_aliases(content, out);
453            }
454            Production::Choice { members } | Production::Seq { members } => {
455                for m in members {
456                    collect_aliases(m, out);
457                }
458            }
459            Production::Repeat { content }
460            | Production::Repeat1 { content }
461            | Production::Optional { content }
462            | Production::Field { content, .. }
463            | Production::Token { content }
464            | Production::ImmediateToken { content }
465            | Production::Prec { content, .. }
466            | Production::PrecLeft { content, .. }
467            | Production::PrecRight { content, .. }
468            | Production::PrecDynamic { content, .. }
469            | Production::Reserved { content, .. } => {
470                collect_aliases(content, out);
471            }
472            _ => {}
473        }
474    }
475    let mut aliases: Vec<(String, &Production)> = Vec::new();
476    for rule in grammar.rules.values() {
477        collect_aliases(rule, &mut aliases);
478    }
479    for (alias_value, content) in aliases {
480        let mut visited: HashSet<&str> = HashSet::new();
481        let mut reachable: HashSet<String> = HashSet::new();
482        walk(grammar, content, &mut visited, &mut reachable);
483        // Aliased value satisfies itself and is satisfied by every
484        // kind its content can reach.
485        subtypes
486            .entry(alias_value.clone())
487            .or_default()
488            .insert(alias_value.clone());
489        for kind in reachable {
490            subtypes
491                .entry(kind)
492                .or_default()
493                .insert(alias_value.clone());
494        }
495    }
496
497    // Transitive close: `K ⊑ A` and `A ⊑ B` implies `K ⊑ B`. Iterate
498    // a few rounds; the relation is small so a quick fixed-point
499    // suffices in practice.
500    for _ in 0..8 {
501        let snapshot = subtypes.clone();
502        let mut changed = false;
503        for (kind, supers) in &snapshot {
504            let extra: HashSet<String> = supers
505                .iter()
506                .flat_map(|s| snapshot.get(s).cloned().unwrap_or_default())
507                .collect();
508            let entry = subtypes.entry(kind.clone()).or_default();
509            for s in extra {
510                if entry.insert(s) {
511                    changed = true;
512                }
513            }
514        }
515        if !changed {
516            break;
517        }
518    }
519
520    subtypes
521}
522
523// ═══════════════════════════════════════════════════════════════════
524// Format policy
525// ═══════════════════════════════════════════════════════════════════
526
527/// Whitespace and indentation policy applied during emission.
528///
529/// The default policy inserts a single space between adjacent tokens,
530/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
531/// boundaries. Per-language overrides (idiomatic indent width,
532/// trailing-comma rules, blank-line conventions) can ride alongside
533/// this struct in a follow-up branch; today's defaults aim only for
534/// syntactic validity.
535#[derive(Debug, Clone)]
536pub struct FormatPolicy {
537    /// Number of spaces per indent level.
538    pub indent_width: usize,
539    /// Tokens after which the walker breaks to a new line.
540    pub line_break_after: Vec<String>,
541    /// Tokens that increase indent on emission.
542    pub indent_open: Vec<String>,
543    /// Tokens that decrease indent on emission.
544    pub indent_close: Vec<String>,
545}
546
547impl Default for FormatPolicy {
548    fn default() -> Self {
549        Self {
550            indent_width: 2,
551            line_break_after: vec![";".into(), "{".into(), "}".into()],
552            indent_open: vec!["{".into()],
553            indent_close: vec!["}".into()],
554        }
555    }
556}
557
558// ═══════════════════════════════════════════════════════════════════
559// Emitter
560// ═══════════════════════════════════════════════════════════════════
561
562/// Emit a by-construction schema to source bytes.
563///
564/// `protocol` is the grammar / language name (used in error messages
565/// and to label the entry point).
566///
567/// The walker treats `schema.entries` as the ordered list of root
568/// vertices, falling back to a deterministic by-id ordering when
569/// `entries` is empty. Each root is emitted using the production
570/// associated with its kind in `grammar.rules`.
571///
572/// # Errors
573///
574/// Returns [`ParseError::EmitFailed`] when:
575///
576/// - the schema has no vertices
577/// - a root vertex's kind is not a grammar rule
578/// - a `SYMBOL` reference points at a kind with no rule and no schema
579///   child to resolve it to
580/// - a required `FIELD` has no corresponding edge in the schema
581pub fn emit_pretty(
582    protocol: &str,
583    schema: &Schema,
584    grammar: &Grammar,
585    policy: &FormatPolicy,
586) -> Result<Vec<u8>, ParseError> {
587    let roots = collect_roots(schema);
588    if roots.is_empty() {
589        return Err(ParseError::EmitFailed {
590            protocol: protocol.to_owned(),
591            reason: "schema has no entry vertices".to_owned(),
592        });
593    }
594
595    let mut out = Output::new(policy);
596    for (i, root) in roots.iter().enumerate() {
597        if i > 0 {
598            out.newline();
599        }
600        emit_vertex(protocol, schema, grammar, root, &mut out)?;
601    }
602    Ok(out.finish())
603}
604
605fn collect_roots(schema: &Schema) -> Vec<&panproto_gat::Name> {
606    if !schema.entries.is_empty() {
607        return schema
608            .entries
609            .iter()
610            .filter(|name| schema.vertices.contains_key(*name))
611            .collect();
612    }
613
614    // Fallback: every vertex that is not the target of any structural edge
615    // (sorted by id for determinism).
616    let mut targets: std::collections::HashSet<&panproto_gat::Name> =
617        std::collections::HashSet::new();
618    for edge in schema.edges.keys() {
619        targets.insert(&edge.tgt);
620    }
621    let mut roots: Vec<&panproto_gat::Name> = schema
622        .vertices
623        .keys()
624        .filter(|name| !targets.contains(name))
625        .collect();
626    roots.sort();
627    roots
628}
629
630fn emit_vertex(
631    protocol: &str,
632    schema: &Schema,
633    grammar: &Grammar,
634    vertex_id: &panproto_gat::Name,
635    out: &mut Output<'_>,
636) -> Result<(), ParseError> {
637    let vertex = schema
638        .vertices
639        .get(vertex_id)
640        .ok_or_else(|| ParseError::EmitFailed {
641            protocol: protocol.to_owned(),
642            reason: format!("vertex '{vertex_id}' not found"),
643        })?;
644
645    // Leaf shortcut: a vertex carrying a `literal-value` constraint
646    // and no outgoing structural edges is a terminal token. Emit the
647    // captured value directly. This handles identifiers, numeric
648    // literals, and string literals that the parser stored as
649    // `literal-value` even on by-construction schemas.
650    if let Some(literal) = literal_value(schema, vertex_id) {
651        if children_for(schema, vertex_id).is_empty() {
652            out.token(literal);
653            return Ok(());
654        }
655    }
656
657    let kind = vertex.kind.as_ref();
658    let edges = children_for(schema, vertex_id);
659    if let Some(rule) = grammar.rules.get(kind) {
660        let mut cursor = ChildCursor::new(&edges);
661        return emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out);
662    }
663
664    // No rule for this kind. The parser produced it via an ALIAS
665    // (tree-sitter's `alias($.something, $.actual_kind)`) or via an
666    // external scanner (e.g. YAML's `document` root). Fall back to
667    // walking the children directly so the inner content survives;
668    // surrounding tokens — whose only source is the missing rule —
669    // are necessarily absent.
670    for edge in &edges {
671        emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
672    }
673    Ok(())
674}
675
676/// Linear cursor over a vertex's outgoing edges, used to thread
677/// children through a production rule without double-consuming them.
678struct ChildCursor<'a> {
679    edges: &'a [&'a Edge],
680    consumed: Vec<bool>,
681}
682
683impl<'a> ChildCursor<'a> {
684    fn new(edges: &'a [&'a Edge]) -> Self {
685        Self {
686            edges,
687            consumed: vec![false; edges.len()],
688        }
689    }
690
691    /// Take the next unconsumed edge whose kind equals `field_name`.
692    fn take_field(&mut self, field_name: &str) -> Option<&'a Edge> {
693        for (i, edge) in self.edges.iter().enumerate() {
694            if !self.consumed[i] && edge.kind.as_ref() == field_name {
695                self.consumed[i] = true;
696                return Some(edge);
697            }
698        }
699        None
700    }
701
702    /// Take the next unconsumed edge whose target vertex satisfies
703    /// `predicate`. Returns the edge and the underlying production
704    /// resolution path is the caller's job.
705    fn take_matching(&mut self, predicate: impl Fn(&Edge) -> bool) -> Option<&'a Edge> {
706        for (i, edge) in self.edges.iter().enumerate() {
707            if !self.consumed[i] && predicate(edge) {
708                self.consumed[i] = true;
709                return Some(edge);
710            }
711        }
712        None
713    }
714
715    /// Whether any unconsumed edge satisfies `predicate`.
716    fn has_matching(&self, predicate: impl Fn(&Edge) -> bool) -> bool {
717        self.edges
718            .iter()
719            .enumerate()
720            .any(|(i, edge)| !self.consumed[i] && predicate(edge))
721    }
722}
723
724thread_local! {
725    static EMIT_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
726    /// Set of `(vertex_id, rule_name)` pairs that are currently being
727    /// walked by the recursion. A SYMBOL that resolves to a rule
728    /// already on this stack closes a μ-binder cycle: in the
729    /// coinductive reading, the rule walk at any vertex is the least
730    /// fixed point of `body[μ X . body / X]`, which unfolds at most
731    /// once, with the second visit returning the empty sequence (the
732    /// unit of the free token monoid). Examples that trigger this:
733    /// YAML's `stream` ⊃ `_b_blk_*` mutually-recursive chain, Rust's
734    /// `_expression` ⊃ `binary_expression` ⊃ `_expression`.
735    static EMIT_MU_FRAMES: std::cell::RefCell<std::collections::HashSet<(String, String)>> =
736        std::cell::RefCell::new(std::collections::HashSet::new());
737}
738
739/// Walk a rule at a vertex inside a μ-binder. The wrapping frame is
740/// pushed before recursion and popped after, so any SYMBOL inside
741/// `rule` that re-enters the same `(vertex_id, rule_name)` pair
742/// returns the empty sequence (μ X . body unfolds once).
743fn walk_in_mu_frame(
744    protocol: &str,
745    schema: &Schema,
746    grammar: &Grammar,
747    vertex_id: &panproto_gat::Name,
748    rule_name: &str,
749    rule: &Production,
750    cursor: &mut ChildCursor<'_>,
751    out: &mut Output<'_>,
752) -> Result<(), ParseError> {
753    let key = (vertex_id.to_string(), rule_name.to_owned());
754    let inserted = EMIT_MU_FRAMES.with(|frames| frames.borrow_mut().insert(key.clone()));
755    if !inserted {
756        // We are already walking this rule at this vertex deeper in
757        // the call stack. The coinductive μ-fixed-point reading
758        // returns the empty sequence here; the surrounding
759        // production resumes after the SYMBOL.
760        return Ok(());
761    }
762    let result = emit_production(protocol, schema, grammar, vertex_id, rule, cursor, out);
763    EMIT_MU_FRAMES.with(|frames| {
764        frames.borrow_mut().remove(&key);
765    });
766    result
767}
768
769fn emit_production(
770    protocol: &str,
771    schema: &Schema,
772    grammar: &Grammar,
773    vertex_id: &panproto_gat::Name,
774    production: &Production,
775    cursor: &mut ChildCursor<'_>,
776    out: &mut Output<'_>,
777) -> Result<(), ParseError> {
778    let depth = EMIT_DEPTH.with(|d| {
779        let v = d.get() + 1;
780        d.set(v);
781        v
782    });
783    if depth > 500 {
784        EMIT_DEPTH.with(|d| d.set(d.get() - 1));
785        return Err(ParseError::EmitFailed {
786            protocol: protocol.to_owned(),
787            reason: format!(
788                "emit_production recursion >500 (likely a cyclic grammar; \
789                     vertex='{vertex_id}')"
790            ),
791        });
792    }
793    let result = emit_production_inner(
794        protocol, schema, grammar, vertex_id, production, cursor, out,
795    );
796    EMIT_DEPTH.with(|d| d.set(d.get() - 1));
797    result
798}
799
800fn emit_production_inner(
801    protocol: &str,
802    schema: &Schema,
803    grammar: &Grammar,
804    vertex_id: &panproto_gat::Name,
805    production: &Production,
806    cursor: &mut ChildCursor<'_>,
807    out: &mut Output<'_>,
808) -> Result<(), ParseError> {
809    match production {
810        Production::String { value } => {
811            out.token(value);
812            Ok(())
813        }
814        Production::Pattern { value } => {
815            if let Some(literal) = literal_value(schema, vertex_id) {
816                out.token(literal);
817            } else {
818                out.token(&placeholder_for_pattern(value));
819            }
820            Ok(())
821        }
822        Production::Blank => Ok(()),
823        Production::Symbol { name } => {
824            if name.starts_with('_') {
825                // Hidden rule: not a vertex kind on the schema side.
826                // Inline-expand the rule body so its children take
827                // edges from the current cursor, instead of trying to
828                // take a single child edge that "satisfies" the
829                // hidden rule and discarding the rest of the body
830                // (which would drop tokens like `=` and the trailing
831                // value SYMBOL inside e.g. TOML's `_inline_pair`).
832                //
833                // Wrapped in a μ-frame so a hidden rule that
834                // references its own kind cyclically (or another
835                // hidden rule that closes the cycle) unfolds once
836                // and then collapses to the empty sequence at the
837                // second visit, rather than blowing the stack.
838                if let Some(rule) = grammar.rules.get(name) {
839                    walk_in_mu_frame(
840                        protocol, schema, grammar, vertex_id, name, rule, cursor, out,
841                    )
842                } else {
843                    // External hidden rule (declared in the
844                    // grammar's `externals` block, scanned by C code,
845                    // not listed in `rules`). Heuristic fallback:
846                    // line-ending / EOF externals are universally
847                    // newline-or-empty, so emitting a single newline
848                    // is the right default for grammars like TOML
849                    // whose `pair` SEQ trails into
850                    // `_line_ending_or_eof`. Anything else falls
851                    // through silently.
852                    if name.contains("line_ending")
853                        || name.contains("newline")
854                        || name.ends_with("_or_eof")
855                    {
856                        out.newline();
857                    }
858                    Ok(())
859                }
860            } else if let Some(edge) = take_symbol_match(grammar, schema, cursor, name) {
861                // For supertype / hidden-rule dispatch the child's
862                // own kind names the actual production to walk
863                // (`child.kind` IS the subtype). For ALIAS the
864                // dependent-optic context is carried by the
865                // surrounding `Production::Alias` branch, which calls
866                // `emit_aliased_child` directly; we don't reach here
867                // for that case. So walking `grammar.rules[child.kind]`
868                // via `emit_vertex` is correct: the dependent-optic
869                // path is preserved at every site where it actually
870                // diverges from `child.kind`.
871                emit_vertex(protocol, schema, grammar, &edge.tgt, out)
872            } else if vertex_id_kind(schema, vertex_id) == Some(name.as_str()) {
873                let rule = grammar
874                    .rules
875                    .get(name)
876                    .ok_or_else(|| ParseError::EmitFailed {
877                        protocol: protocol.to_owned(),
878                        reason: format!("no production for SYMBOL '{name}'"),
879                    })?;
880                // Self-reference (`X = ... SYMBOL X ...`): wrap in a
881                // μ-frame so re-entry collapses to the empty sequence.
882                walk_in_mu_frame(
883                    protocol, schema, grammar, vertex_id, name, rule, cursor, out,
884                )
885            } else {
886                // Named rule with no matching child: emit nothing and
887                // let the surrounding CHOICE / OPTIONAL / REPEAT
888                // resolve the absence.
889                Ok(())
890            }
891        }
892        Production::Seq { members } => {
893            for member in members {
894                emit_production(protocol, schema, grammar, vertex_id, member, cursor, out)?;
895            }
896            Ok(())
897        }
898        Production::Choice { members } => {
899            if let Some(matched) =
900                pick_choice_with_cursor(schema, grammar, vertex_id, cursor, members)
901            {
902                emit_production(protocol, schema, grammar, vertex_id, matched, cursor, out)
903            } else {
904                Ok(())
905            }
906        }
907        Production::Repeat { content } | Production::Repeat1 { content } => {
908            let mut emitted_any = false;
909            loop {
910                let cursor_snap = cursor.consumed.clone();
911                let out_snap = out.snapshot();
912                let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
913                let result =
914                    emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
915                let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
916                if result.is_err() || consumed_after == consumed_before {
917                    cursor.consumed = cursor_snap;
918                    out.restore(out_snap);
919                    break;
920                }
921                emitted_any = true;
922            }
923            if matches!(production, Production::Repeat1 { .. }) && !emitted_any {
924                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)?;
925            }
926            Ok(())
927        }
928        Production::Optional { content } => {
929            let cursor_snap = cursor.consumed.clone();
930            let out_snap = out.snapshot();
931            let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
932            let result =
933                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
934            // OPTIONAL is a backtracking site: if the inner production
935            // errored *or* made no progress without leaving a witness
936            // constraint, restore both cursor and output to their
937            // pre-attempt state. Mirrors `Repeat`'s loop body.
938            if result.is_err() {
939                cursor.consumed = cursor_snap;
940                out.restore(out_snap);
941                return result;
942            }
943            let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
944            if consumed_after == consumed_before
945                && !has_relevant_constraint(content, schema, vertex_id)
946            {
947                cursor.consumed = cursor_snap;
948                out.restore(out_snap);
949            }
950            Ok(())
951        }
952        Production::Field { name, content } => {
953            if let Some(edge) = cursor.take_field(name) {
954                emit_in_child_context(protocol, schema, grammar, &edge.tgt, content, out)
955            } else if first_symbol(content).is_none() {
956                // FIELD wraps a non-child production (e.g. a literal
957                // STRING operator like `+` in a binary_expression, or
958                // a CHOICE of STRING tokens). The walker captures
959                // these as interstitials rather than vertices, so the
960                // schema has no field edge to consume; emit the
961                // content in place so the operator / keyword survives
962                // the round-trip.
963                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
964            } else {
965                Ok(())
966            }
967        }
968        Production::Alias {
969            content,
970            named,
971            value,
972        } => {
973            // A named ALIAS rewrites the parser-visible kind to
974            // `value`. If the cursor has an unconsumed child whose
975            // kind matches that alias name, take it and emit the
976            // child using the alias's INNER content as the rule
977            // (e.g. `ALIAS { SYMBOL real_rule, value: "kind_x" }`
978            // means a `kind_x` vertex on the schema should be walked
979            // through `real_rule`'s body, not through whatever rule
980            // happens to be keyed under `kind_x`). This is the
981            // dependent-optic shape: the rule the emitter walks at a
982            // child position is determined by the parent's chosen
983            // alias, not by the child kind alone — without it,
984            // grammars like YAML that introduce the same kind through
985            // many ALIAS sites lose the parent context the moment
986            // emit_vertex is called.
987            if *named && !value.is_empty() {
988                if let Some(edge) = cursor.take_matching(|edge| {
989                    schema
990                        .vertices
991                        .get(&edge.tgt)
992                        .map(|v| v.kind.as_ref() == value.as_str())
993                        .unwrap_or(false)
994                }) {
995                    return emit_aliased_child(protocol, schema, grammar, &edge.tgt, content, out);
996                }
997            }
998            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
999        }
1000        Production::Token { content }
1001        | Production::ImmediateToken { content }
1002        | Production::Prec { content, .. }
1003        | Production::PrecLeft { content, .. }
1004        | Production::PrecRight { content, .. }
1005        | Production::PrecDynamic { content, .. }
1006        | Production::Reserved { content, .. } => {
1007            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1008        }
1009    }
1010}
1011
1012/// Take the next cursor edge whose target vertex's kind matches the
1013/// SYMBOL `name` directly or via inline expansion of a hidden rule.
1014fn take_symbol_match<'a>(
1015    grammar: &Grammar,
1016    schema: &Schema,
1017    cursor: &mut ChildCursor<'a>,
1018    name: &str,
1019) -> Option<&'a Edge> {
1020    cursor.take_matching(|edge| {
1021        let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1022        kind_satisfies_symbol(grammar, target_kind, name)
1023    })
1024}
1025
1026/// Decide whether a schema vertex of kind `target_kind` satisfies a
1027/// SYMBOL `name` reference in the grammar.
1028///
1029/// Operates as an O(1) lookup against the precomputed subtype
1030/// closure built at [`Grammar::from_bytes`]. The semantic content is
1031/// "K satisfies SYMBOL S iff K is reachable from S by walking the
1032/// grammar's hidden, supertype, and named-alias dispatch": this is
1033/// exactly the relation tree-sitter induces on `(parser-visible kind,
1034/// rule-position)` pairs.
1035fn kind_satisfies_symbol(grammar: &Grammar, target_kind: Option<&str>, name: &str) -> bool {
1036    let Some(target) = target_kind else {
1037        return false;
1038    };
1039    if target == name {
1040        return true;
1041    }
1042    grammar
1043        .subtypes
1044        .get(target)
1045        .is_some_and(|set| set.contains(name))
1046}
1047
1048/// Emit a child reached through an ALIAS production using the
1049/// alias's inner content as the rule, not `grammar.rules[child.kind]`.
1050///
1051/// This carries the dependent-optic context across the ALIAS edge:
1052/// at the parent rule's site we know which underlying production the
1053/// alias wraps (typically `SYMBOL real_rule`), and that's the
1054/// production that should drive the emit walk on the child's
1055/// children. Looking up `grammar.rules.get(child.kind)` instead would
1056/// either fail (the renamed kind has no top-level rule, e.g. YAML's
1057/// `block_mapping_pair`) or pick an arbitrary same-kinded rule from
1058/// elsewhere in the grammar.
1059///
1060/// Walk-context invariant. The dependent-optic shape of `emit_pretty`
1061/// says: the production walked at any vertex is determined by the
1062/// path from the root through the grammar, not by the vertex kind in
1063/// isolation. Two dispatch sites realise that invariant:
1064///
1065/// * [`emit_vertex`] looks up `grammar.rules[child.kind]` and walks
1066///   it. Correct for supertype / hidden-rule dispatch: the child's
1067///   kind on the schema IS the subtype tree-sitter selected, so its
1068///   top-level rule is the right production to walk.
1069/// * `emit_aliased_child` threads the parent rule's `Production`
1070///   directly (the inner `content` of `Production::Alias`) and walks
1071///   it on the child's children. Correct for ALIAS dispatch: the
1072///   child's kind on the schema is the alias's `value` (a renamed
1073///   kind that may have no top-level rule), and the production to
1074///   walk is the alias's content body, supplied by the parent.
1075///
1076/// Together these cover every site where the rule-walked-at-child
1077/// diverges from `grammar.rules[child.kind]`; the recursion site for
1078/// plain SYMBOL therefore correctly delegates to `emit_vertex`, and
1079/// we do not need a richer `WalkContext` value passed by reference.
1080/// The grammar dependency is the thread.
1081fn emit_aliased_child(
1082    protocol: &str,
1083    schema: &Schema,
1084    grammar: &Grammar,
1085    child_id: &panproto_gat::Name,
1086    content: &Production,
1087    out: &mut Output<'_>,
1088) -> Result<(), ParseError> {
1089    // Leaf shortcut: if the child has a literal-value and no
1090    // structural children, emit the captured text. Identifiers and
1091    // similar terminals reach here when an ALIAS wraps a SYMBOL that
1092    // resolves to a PATTERN.
1093    if let Some(literal) = literal_value(schema, child_id) {
1094        if children_for(schema, child_id).is_empty() {
1095            out.token(literal);
1096            return Ok(());
1097        }
1098    }
1099
1100    // Resolve `content` to a rule when it's a SYMBOL (the dominant
1101    // shape: `ALIAS { content: SYMBOL real_rule, value: "kind_x" }`).
1102    if let Production::Symbol { name } = content {
1103        if let Some(rule) = grammar.rules.get(name) {
1104            let edges = children_for(schema, child_id);
1105            let mut cursor = ChildCursor::new(&edges);
1106            return emit_production(protocol, schema, grammar, child_id, rule, &mut cursor, out);
1107        }
1108    }
1109
1110    // Other ALIAS contents (CHOICE, SEQ, literals) walk in place.
1111    let edges = children_for(schema, child_id);
1112    let mut cursor = ChildCursor::new(&edges);
1113    emit_production(
1114        protocol,
1115        schema,
1116        grammar,
1117        child_id,
1118        content,
1119        &mut cursor,
1120        out,
1121    )
1122}
1123
1124fn emit_in_child_context(
1125    protocol: &str,
1126    schema: &Schema,
1127    grammar: &Grammar,
1128    child_id: &panproto_gat::Name,
1129    production: &Production,
1130    out: &mut Output<'_>,
1131) -> Result<(), ParseError> {
1132    // If `production` is a structural wrapper (CHOICE / SEQ /
1133    // OPTIONAL / ...) whose referenced symbols cover the child's own
1134    // kind, the child IS the production's target node and the right
1135    // emit path is `emit_vertex(child)` (which honours the
1136    // literal-value leaf shortcut). Without this guard, FIELD(pattern,
1137    // CHOICE { _pattern, self }) on an identifier child walks the
1138    // CHOICE on the identifier's empty cursor, falls through to the
1139    // first non-BLANK alt, and loses the captured identifier text.
1140    if !matches!(production, Production::Symbol { .. }) {
1141        let child_kind = schema.vertices.get(child_id).map(|v| v.kind.as_ref());
1142        let symbols = referenced_symbols(production);
1143        if symbols
1144            .iter()
1145            .any(|s| kind_satisfies_symbol(grammar, child_kind, s) || child_kind == Some(s))
1146        {
1147            return emit_vertex(protocol, schema, grammar, child_id, out);
1148        }
1149    }
1150    match production {
1151        Production::Symbol { .. } => emit_vertex(protocol, schema, grammar, child_id, out),
1152        _ => {
1153            let edges = children_for(schema, child_id);
1154            let mut cursor = ChildCursor::new(&edges);
1155            emit_production(
1156                protocol,
1157                schema,
1158                grammar,
1159                child_id,
1160                production,
1161                &mut cursor,
1162                out,
1163            )
1164        }
1165    }
1166}
1167
1168fn pick_choice_with_cursor<'a>(
1169    schema: &Schema,
1170    grammar: &Grammar,
1171    vertex_id: &panproto_gat::Name,
1172    cursor: &ChildCursor<'_>,
1173    alternatives: &'a [Production],
1174) -> Option<&'a Production> {
1175    // Discriminator-driven dispatch (highest priority): when the
1176    // walker recorded a `chose-alt-fingerprint` constraint at parse
1177    // time, dispatch directly against that. This is the categorical
1178    // discriminator: it survives stripping of byte-position
1179    // constraints (so by-construction round-trips work) and is the
1180    // explicit witness of which CHOICE alternative the parser took.
1181    //
1182    // Falls back to the live `interstitial-*` substring blob when no
1183    // fingerprint is present (e.g. instances built by callers that
1184    // bypass the AstWalker). Both blobs are scored by the longest
1185    // STRING-literal token in an alternative that matches; the
1186    // length tiebreak prefers `&&` over `&`, `==` over `=`, etc.
1187    let constraint_blob = schema
1188        .constraints
1189        .get(vertex_id)
1190        .map(|cs| {
1191            let fingerprint: Option<&str> = cs
1192                .iter()
1193                .find(|c| c.sort.as_ref() == "chose-alt-fingerprint")
1194                .map(|c| c.value.as_str());
1195            if let Some(fp) = fingerprint {
1196                fp.to_owned()
1197            } else {
1198                cs.iter()
1199                    .filter(|c| {
1200                        let s = c.sort.as_ref();
1201                        s.starts_with("interstitial-") && !s.ends_with("-start-byte")
1202                    })
1203                    .map(|c| c.value.as_str())
1204                    .collect::<Vec<&str>>()
1205                    .join(" ")
1206            }
1207        })
1208        .unwrap_or_default();
1209    let child_kinds: Vec<&str> = schema
1210        .constraints
1211        .get(vertex_id)
1212        .and_then(|cs| {
1213            cs.iter()
1214                .find(|c| c.sort.as_ref() == "chose-alt-child-kinds")
1215                .map(|c| c.value.split_whitespace().collect())
1216        })
1217        .unwrap_or_default();
1218    if !constraint_blob.is_empty() {
1219        // Primary score: literal-token match length. This dominates
1220        // alt selection so existing language tests that depend on
1221        // literal-only fingerprints keep working.
1222        // Secondary score (tiebreaker only): named-symbol kind match
1223        // count, read from the separate `chose-alt-child-kinds`
1224        // constraint (kept apart from the literal fingerprint so
1225        // identifiers like `:` in the kind list don't contaminate the
1226        // literal match). An alt that matches the recorded kinds is a
1227        // stronger witness than one whose only
1228        // overlap is literal punctuation.
1229        let mut best_literal: usize = 0;
1230        let mut best_symbols: usize = 0;
1231        let mut best_alt: Option<&Production> = None;
1232        let mut tied = false;
1233        for alt in alternatives {
1234            let strings = literal_strings(alt);
1235            if strings.is_empty() {
1236                continue;
1237            }
1238            let literal_score = strings
1239                .iter()
1240                .filter(|s| constraint_blob.contains(s.as_str()))
1241                .map(String::len)
1242                .sum::<usize>();
1243            if literal_score == 0 {
1244                continue;
1245            }
1246            // Symbol score is computed only as a tiebreaker among alts
1247            // whose literal-token coverage is the same; it never lifts
1248            // an alt above one with a strictly higher literal score.
1249            // Reads the `chose-alt-child-kinds` constraint (a separate
1250            // sequence the walker emits, kept apart from the literal
1251            // fingerprint to avoid cross-contamination).
1252            let symbol_score = if literal_score >= best_literal && !child_kinds.is_empty() {
1253                let symbols = referenced_symbols(alt);
1254                symbols
1255                    .iter()
1256                    .filter(|sym| {
1257                        let sym_str: &str = sym;
1258                        if child_kinds.contains(&sym_str) {
1259                            return true;
1260                        }
1261                        grammar.subtypes.get(sym_str).is_some_and(|sub_set| {
1262                            sub_set
1263                                .iter()
1264                                .any(|sub| child_kinds.contains(&sub.as_str()))
1265                        })
1266                    })
1267                    .count()
1268            } else {
1269                0
1270            };
1271            let better = literal_score > best_literal
1272                || (literal_score == best_literal && symbol_score > best_symbols);
1273            let same = literal_score == best_literal && symbol_score == best_symbols;
1274            if better {
1275                best_literal = literal_score;
1276                best_symbols = symbol_score;
1277                best_alt = Some(alt);
1278                tied = false;
1279            } else if same && best_alt.is_some() {
1280                tied = true;
1281            }
1282        }
1283        // Only commit to an alt when the fingerprint discriminates it
1284        // uniquely. A tie means the alts share the same literal token
1285        // set (e.g. JSON's `string = CHOICE { SEQ { '"', '"' }, SEQ {
1286        // '"', _string_content, '"' } }` — both alts contain just the
1287        // two `"` tokens). In that case fall through to cursor-based
1288        // dispatch, which uses the actual edge structure.
1289        if let Some(alt) = best_alt {
1290            if !tied {
1291                return Some(alt);
1292            }
1293        }
1294    }
1295
1296    // Cursor-driven dispatch: pick the alternative whose body
1297    // references at least one SYMBOL whose target kind is present in
1298    // the unconsumed cursor edges. `referenced_symbols` walks the
1299    // alternative recursively (across nested SEQs, REPEATs, OPTIONALs,
1300    // FIELDs, etc.) so a leading optional like `attribute_item` does
1301    // not block matching when only the trailing required symbol is
1302    // present on the schema.
1303    for alt in alternatives {
1304        let symbols = referenced_symbols(alt);
1305        if !symbols.is_empty()
1306            && cursor.has_matching(|edge| {
1307                let tk = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1308                symbols
1309                    .iter()
1310                    .any(|s| kind_satisfies_symbol(grammar, tk, s))
1311            })
1312        {
1313            return Some(alt);
1314        }
1315    }
1316
1317    // FIELD dispatch: pick an alternative whose FIELD name matches an
1318    // unconsumed edge kind.
1319    let edge_kinds: Vec<&str> = cursor
1320        .edges
1321        .iter()
1322        .enumerate()
1323        .filter(|(i, _)| !cursor.consumed[*i])
1324        .map(|(_, e)| e.kind.as_ref())
1325        .collect();
1326    for alt in alternatives {
1327        if has_field_in(alt, &edge_kinds) {
1328            return Some(alt);
1329        }
1330    }
1331
1332    // No cursor-driven match. Fall back to:
1333    //
1334    // - BLANK (the explicit empty alternative) when present, so an
1335    //   OPTIONAL-shaped CHOICE compiles to nothing.
1336    // - The first non-`BLANK` alternative as a last resort, used by
1337    //   STRING-only alternatives (keyword tokens) and other choices
1338    //   that don't reach the cursor.
1339    //
1340    // The previous "match own_kind" branch is intentionally absent:
1341    // when an alt's first SYMBOL equals the current vertex's kind, the
1342    // caller is already emitting that vertex's own rule. Recursing
1343    // into the alt would cause a self-loop in the rule walk.
1344    let _ = (schema, vertex_id);
1345    if alternatives.iter().any(|a| matches!(a, Production::Blank)) {
1346        return alternatives.iter().find(|a| matches!(a, Production::Blank));
1347    }
1348    alternatives
1349        .iter()
1350        .find(|alt| !matches!(alt, Production::Blank))
1351}
1352
1353/// Collect every literal STRING token directly inside `production`
1354/// (without descending into SYMBOLs / hidden rules). Used to score
1355/// CHOICE alternatives against the parent vertex's interstitials so
1356/// the right operator / keyword form is picked when the schema
1357/// preserves interstitial fragments from a prior parse.
1358fn literal_strings(production: &Production) -> Vec<String> {
1359    let mut out = Vec::new();
1360    fn walk(p: &Production, out: &mut Vec<String>) {
1361        match p {
1362            Production::String { value } if !value.is_empty() => {
1363                out.push(value.clone());
1364            }
1365            Production::Choice { members } | Production::Seq { members } => {
1366                for m in members {
1367                    walk(m, out);
1368                }
1369            }
1370            Production::Repeat { content }
1371            | Production::Repeat1 { content }
1372            | Production::Optional { content }
1373            | Production::Field { content, .. }
1374            | Production::Alias { content, .. }
1375            | Production::Token { content }
1376            | Production::ImmediateToken { content }
1377            | Production::Prec { content, .. }
1378            | Production::PrecLeft { content, .. }
1379            | Production::PrecRight { content, .. }
1380            | Production::PrecDynamic { content, .. }
1381            | Production::Reserved { content, .. } => walk(content, out),
1382            _ => {}
1383        }
1384    }
1385    walk(production, &mut out);
1386    out
1387}
1388
1389/// Collect every SYMBOL name reachable from `production` without
1390/// crossing into nested rules. Used by `pick_choice_with_cursor` to
1391/// rank alternatives by "any SYMBOL inside this alt matches something
1392/// on the cursor", instead of just the first SYMBOL: a leading
1393/// optional like `attribute_item` then `parameter` is otherwise
1394/// rejected when only the parameter children are present.
1395fn referenced_symbols(production: &Production) -> Vec<&str> {
1396    let mut out = Vec::new();
1397    fn walk<'a>(p: &'a Production, out: &mut Vec<&'a str>) {
1398        match p {
1399            Production::Symbol { name } => out.push(name.as_str()),
1400            Production::Choice { members } | Production::Seq { members } => {
1401                for m in members {
1402                    walk(m, out);
1403                }
1404            }
1405            Production::Repeat { content }
1406            | Production::Repeat1 { content }
1407            | Production::Optional { content }
1408            | Production::Field { content, .. }
1409            | Production::Alias { content, .. }
1410            | Production::Token { content }
1411            | Production::ImmediateToken { content }
1412            | Production::Prec { content, .. }
1413            | Production::PrecLeft { content, .. }
1414            | Production::PrecRight { content, .. }
1415            | Production::PrecDynamic { content, .. }
1416            | Production::Reserved { content, .. } => walk(content, out),
1417            _ => {}
1418        }
1419    }
1420    walk(production, &mut out);
1421    out
1422}
1423
1424fn first_symbol(production: &Production) -> Option<&str> {
1425    match production {
1426        Production::Symbol { name } => Some(name),
1427        Production::Seq { members } => members.iter().find_map(first_symbol),
1428        Production::Choice { members } => members.iter().find_map(first_symbol),
1429        Production::Repeat { content }
1430        | Production::Repeat1 { content }
1431        | Production::Optional { content }
1432        | Production::Field { content, .. }
1433        | Production::Alias { content, .. }
1434        | Production::Token { content }
1435        | Production::ImmediateToken { content }
1436        | Production::Prec { content, .. }
1437        | Production::PrecLeft { content, .. }
1438        | Production::PrecRight { content, .. }
1439        | Production::PrecDynamic { content, .. }
1440        | Production::Reserved { content, .. } => first_symbol(content),
1441        _ => None,
1442    }
1443}
1444
1445fn has_field_in(production: &Production, edge_kinds: &[&str]) -> bool {
1446    match production {
1447        Production::Field { name, .. } => edge_kinds.contains(&name.as_str()),
1448        Production::Seq { members } | Production::Choice { members } => {
1449            members.iter().any(|m| has_field_in(m, edge_kinds))
1450        }
1451        Production::Repeat { content }
1452        | Production::Repeat1 { content }
1453        | Production::Optional { content }
1454        | Production::Alias { content, .. }
1455        | Production::Token { content }
1456        | Production::ImmediateToken { content }
1457        | Production::Prec { content, .. }
1458        | Production::PrecLeft { content, .. }
1459        | Production::PrecRight { content, .. }
1460        | Production::PrecDynamic { content, .. }
1461        | Production::Reserved { content, .. } => has_field_in(content, edge_kinds),
1462        _ => false,
1463    }
1464}
1465
1466fn has_relevant_constraint(
1467    production: &Production,
1468    schema: &Schema,
1469    vertex_id: &panproto_gat::Name,
1470) -> bool {
1471    let constraints = match schema.constraints.get(vertex_id) {
1472        Some(c) => c,
1473        None => return false,
1474    };
1475    fn walk(production: &Production, constraints: &[panproto_schema::Constraint]) -> bool {
1476        match production {
1477            Production::String { value } => constraints
1478                .iter()
1479                .any(|c| c.value == *value || c.sort.as_ref() == value),
1480            Production::Field { name, content } => {
1481                constraints.iter().any(|c| c.sort.as_ref() == name) || walk(content, constraints)
1482            }
1483            Production::Seq { members } | Production::Choice { members } => {
1484                members.iter().any(|m| walk(m, constraints))
1485            }
1486            Production::Repeat { content }
1487            | Production::Repeat1 { content }
1488            | Production::Optional { content }
1489            | Production::Alias { content, .. }
1490            | Production::Token { content }
1491            | Production::ImmediateToken { content }
1492            | Production::Prec { content, .. }
1493            | Production::PrecLeft { content, .. }
1494            | Production::PrecRight { content, .. }
1495            | Production::PrecDynamic { content, .. }
1496            | Production::Reserved { content, .. } => walk(content, constraints),
1497            _ => false,
1498        }
1499    }
1500    walk(production, constraints)
1501}
1502
1503fn children_for<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Vec<&'a Edge> {
1504    let mut edges: Vec<&Edge> = schema
1505        .edges
1506        .keys()
1507        .filter(|e| &e.src == vertex_id)
1508        .collect();
1509    edges.sort_by_key(|e| {
1510        // Edges with an explicit ordering position come first; remaining
1511        // edges sort lexicographically by (kind, target id) for
1512        // deterministic walks.
1513        let pos = schema.orderings.get(*e).copied().unwrap_or(u32::MAX);
1514        (pos, e.kind.clone(), e.tgt.clone())
1515    });
1516    edges
1517}
1518
1519fn vertex_id_kind<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1520    schema.vertices.get(vertex_id).map(|v| v.kind.as_ref())
1521}
1522
1523fn literal_value<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1524    schema
1525        .constraints
1526        .get(vertex_id)?
1527        .iter()
1528        .find(|c| c.sort.as_ref() == "literal-value")
1529        .map(|c| c.value.as_str())
1530}
1531
1532fn placeholder_for_pattern(pattern: &str) -> String {
1533    // Heuristic placeholder for unconstrained PATTERN terminals.
1534    //
1535    // First handle the "the regex IS a literal escape" cases that
1536    // tree-sitter grammars use as separators (`\n`, `\r\n`, `;`,
1537    // etc.); emitting the matching character is always preferable
1538    // to a `_x` identifier-like placeholder when the surrounding
1539    // grammar expects a separator.
1540    let simple_lit = decode_simple_pattern_literal(pattern);
1541    if let Some(lit) = simple_lit {
1542        return lit;
1543    }
1544
1545    if pattern.contains("[0-9]") || pattern.contains("\\d") {
1546        "0".into()
1547    } else if pattern.contains("[a-zA-Z_]") || pattern.contains("\\w") {
1548        "_x".into()
1549    } else if pattern.contains('"') || pattern.contains('\'') {
1550        "\"\"".into()
1551    } else {
1552        "_".into()
1553    }
1554}
1555
1556/// Decode a tree-sitter PATTERN whose regex is a simple literal
1557/// (newline, semicolon, comma, etc.) to the byte sequence it matches.
1558/// Returns `None` for patterns with character classes, alternations,
1559/// or quantifiers; the caller falls back to the heuristic placeholder.
1560fn decode_simple_pattern_literal(pattern: &str) -> Option<String> {
1561    // Skip patterns containing regex metachars that would broaden the
1562    // match beyond a single literal byte sequence.
1563    if pattern
1564        .chars()
1565        .any(|c| matches!(c, '[' | ']' | '(' | ')' | '*' | '+' | '?' | '|' | '{' | '}'))
1566    {
1567        return None;
1568    }
1569    let mut out = String::new();
1570    let mut chars = pattern.chars();
1571    while let Some(c) = chars.next() {
1572        if c == '\\' {
1573            match chars.next() {
1574                Some('n') => out.push('\n'),
1575                Some('r') => out.push('\r'),
1576                Some('t') => out.push('\t'),
1577                Some('\\') => out.push('\\'),
1578                Some('/') => out.push('/'),
1579                Some(other) => out.push(other),
1580                None => return None,
1581            }
1582        } else {
1583            out.push(c);
1584        }
1585    }
1586    Some(out)
1587}
1588
1589// ═══════════════════════════════════════════════════════════════════
1590// Token list output with Spacing algebra
1591// ═══════════════════════════════════════════════════════════════════
1592//
1593// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
1594// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
1595// by `FormatPolicy`. Separating the structural output from the layout
1596// decision means each phase has one job: emit walks the grammar and
1597// pushes tokens; layout is a single fold, locally driven by adjacent
1598// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
1599
1600#[derive(Clone)]
1601enum Token {
1602    /// A user-visible terminal contributed by the grammar.
1603    Lit(String),
1604    /// `indent_open` marker emitted when a `Lit` matched the policy's
1605    /// open list. Carried as a separate token so layout can decide to
1606    /// break + indent without re-scanning.
1607    IndentOpen,
1608    /// `indent_close` marker emitted before a closer-`Lit`.
1609    IndentClose,
1610    /// "Break a line here if not already at line start" — used after
1611    /// statements/declarations and after open braces.
1612    LineBreak,
1613}
1614
1615struct Output<'a> {
1616    tokens: Vec<Token>,
1617    policy: &'a FormatPolicy,
1618}
1619
1620#[derive(Clone)]
1621struct OutputSnapshot {
1622    tokens_len: usize,
1623}
1624
1625impl<'a> Output<'a> {
1626    fn new(policy: &'a FormatPolicy) -> Self {
1627        Self {
1628            tokens: Vec::new(),
1629            policy,
1630        }
1631    }
1632
1633    fn token(&mut self, value: &str) {
1634        if value.is_empty() {
1635            return;
1636        }
1637
1638        if self.policy.indent_close.iter().any(|t| t == value) {
1639            self.tokens.push(Token::IndentClose);
1640        }
1641
1642        self.tokens.push(Token::Lit(value.to_owned()));
1643
1644        if self.policy.indent_open.iter().any(|t| t == value) {
1645            self.tokens.push(Token::IndentOpen);
1646            self.tokens.push(Token::LineBreak);
1647        } else if self.policy.line_break_after.iter().any(|t| t == value) {
1648            self.tokens.push(Token::LineBreak);
1649        }
1650    }
1651
1652    fn newline(&mut self) {
1653        self.tokens.push(Token::LineBreak);
1654    }
1655
1656    fn snapshot(&self) -> OutputSnapshot {
1657        OutputSnapshot {
1658            tokens_len: self.tokens.len(),
1659        }
1660    }
1661
1662    fn restore(&mut self, snap: OutputSnapshot) {
1663        self.tokens.truncate(snap.tokens_len);
1664    }
1665
1666    fn finish(self) -> Vec<u8> {
1667        layout(&self.tokens, self.policy)
1668    }
1669}
1670
1671/// Fold a token list into bytes. The algebra:
1672/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
1673/// * `IndentOpen` / `IndentClose` adjust a depth counter,
1674/// * `LineBreak` writes `\n` if not already at line start, then the
1675///   next `Lit` writes `indent * indent_width` spaces of indent.
1676fn layout(tokens: &[Token], policy: &FormatPolicy) -> Vec<u8> {
1677    let mut bytes = Vec::new();
1678    let mut indent: usize = 0;
1679    let mut at_line_start = true;
1680    let mut last_lit: Option<&str> = None;
1681
1682    for tok in tokens {
1683        match tok {
1684            Token::IndentOpen => indent += 1,
1685            Token::IndentClose => {
1686                indent = indent.saturating_sub(1);
1687                if !at_line_start {
1688                    bytes.push(b'\n');
1689                    at_line_start = true;
1690                }
1691            }
1692            Token::LineBreak => {
1693                if !at_line_start {
1694                    bytes.push(b'\n');
1695                    at_line_start = true;
1696                }
1697            }
1698            Token::Lit(value) => {
1699                if at_line_start {
1700                    bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
1701                } else if let Some(prev) = last_lit {
1702                    if needs_space_between(prev, value) {
1703                        bytes.push(b' ');
1704                    }
1705                }
1706                bytes.extend_from_slice(value.as_bytes());
1707                at_line_start = false;
1708                last_lit = Some(value.as_str());
1709            }
1710        }
1711    }
1712
1713    if !at_line_start {
1714        bytes.push(b'\n');
1715    }
1716    bytes
1717}
1718
1719fn needs_space_between(last: &str, next: &str) -> bool {
1720    if last.is_empty() || next.is_empty() {
1721        return false;
1722    }
1723    if is_punct_open(last) || is_punct_open(next) {
1724        return false;
1725    }
1726    if is_punct_close(next) {
1727        return false;
1728    }
1729    if is_punct_close(last) && is_punct_punctuation(next) {
1730        return false;
1731    }
1732    if last == "." || next == "." {
1733        return false;
1734    }
1735    if last_is_word_like(last) && first_is_word_like(next) {
1736        return true;
1737    }
1738    if last_ends_with_alnum(last) && first_is_alnum_or_underscore(next) {
1739        return true;
1740    }
1741    // Adjacent operator runs: keep them apart so the lexer doesn't glue
1742    // `>` and `=` into `>=` unintentionally.
1743    true
1744}
1745
1746fn is_punct_open(s: &str) -> bool {
1747    matches!(s, "(" | "[" | "{" | "\"" | "'" | "`")
1748}
1749
1750fn is_punct_close(s: &str) -> bool {
1751    matches!(s, ")" | "]" | "}" | "," | ";" | ":" | "\"" | "'" | "`")
1752}
1753
1754fn is_punct_punctuation(s: &str) -> bool {
1755    matches!(s, "," | ";" | ":" | "." | ")" | "]" | "}")
1756}
1757
1758fn last_is_word_like(s: &str) -> bool {
1759    s.chars()
1760        .next_back()
1761        .map(|c| c.is_alphanumeric() || c == '_')
1762        .unwrap_or(false)
1763}
1764
1765fn first_is_word_like(s: &str) -> bool {
1766    s.chars()
1767        .next()
1768        .map(|c| c.is_alphanumeric() || c == '_')
1769        .unwrap_or(false)
1770}
1771
1772fn last_ends_with_alnum(s: &str) -> bool {
1773    s.chars()
1774        .next_back()
1775        .map(char::is_alphanumeric)
1776        .unwrap_or(false)
1777}
1778
1779fn first_is_alnum_or_underscore(s: &str) -> bool {
1780    s.chars()
1781        .next()
1782        .map(|c| c.is_alphanumeric() || c == '_')
1783        .unwrap_or(false)
1784}
1785
1786#[cfg(test)]
1787mod tests {
1788    use super::*;
1789
1790    #[test]
1791    fn parses_simple_grammar_json() {
1792        let bytes = br#"{
1793            "name": "tiny",
1794            "rules": {
1795                "program": {
1796                    "type": "SEQ",
1797                    "members": [
1798                        {"type": "STRING", "value": "hello"},
1799                        {"type": "STRING", "value": ";"}
1800                    ]
1801                }
1802            }
1803        }"#;
1804        let g = Grammar::from_bytes("tiny", bytes).expect("valid tiny grammar");
1805        assert!(g.rules.contains_key("program"));
1806    }
1807
1808    #[test]
1809    fn output_emits_punctuation_without_leading_space() {
1810        let policy = FormatPolicy::default();
1811        let mut out = Output::new(&policy);
1812        out.token("foo");
1813        out.token("(");
1814        out.token(")");
1815        out.token(";");
1816        let bytes = out.finish();
1817        let s = std::str::from_utf8(&bytes).expect("ascii output");
1818        assert!(s.starts_with("foo();"), "got {s:?}");
1819    }
1820
1821    #[test]
1822    fn grammar_from_bytes_rejects_malformed_input() {
1823        let result = Grammar::from_bytes("malformed", b"not json");
1824        let err = result.expect_err("malformed bytes must yield Err");
1825        let msg = err.to_string();
1826        assert!(
1827            msg.contains("malformed"),
1828            "error message should name the protocol: {msg:?}"
1829        );
1830    }
1831
1832    #[test]
1833    fn output_indents_after_open_brace() {
1834        let policy = FormatPolicy::default();
1835        let mut out = Output::new(&policy);
1836        out.token("fn");
1837        out.token("foo");
1838        out.token("(");
1839        out.token(")");
1840        out.token("{");
1841        out.token("body");
1842        out.token("}");
1843        let bytes = out.finish();
1844        let s = std::str::from_utf8(&bytes).expect("ascii output");
1845        assert!(s.contains("{\n"), "newline after opening brace: {s:?}");
1846        assert!(s.contains("body"), "body inside block: {s:?}");
1847        assert!(s.ends_with("}\n"), "newline after closing brace: {s:?}");
1848    }
1849
1850    #[test]
1851    fn output_no_space_between_word_and_dot() {
1852        let policy = FormatPolicy::default();
1853        let mut out = Output::new(&policy);
1854        out.token("foo");
1855        out.token(".");
1856        out.token("bar");
1857        let bytes = out.finish();
1858        let s = std::str::from_utf8(&bytes).expect("ascii output");
1859        assert!(s.starts_with("foo.bar"), "no space around dot: {s:?}");
1860    }
1861
1862    #[test]
1863    fn output_snapshot_restore_truncates_bytes() {
1864        let policy = FormatPolicy::default();
1865        let mut out = Output::new(&policy);
1866        out.token("keep");
1867        let snap = out.snapshot();
1868        out.token("drop");
1869        out.token("more");
1870        out.restore(snap);
1871        out.token("after");
1872        let bytes = out.finish();
1873        let s = std::str::from_utf8(&bytes).expect("ascii output");
1874        assert!(s.contains("keep"), "kept token survives: {s:?}");
1875        assert!(s.contains("after"), "post-restore token visible: {s:?}");
1876        assert!(!s.contains("drop"), "rolled-back token removed: {s:?}");
1877        assert!(!s.contains("more"), "rolled-back token removed: {s:?}");
1878    }
1879
1880    #[test]
1881    fn child_cursor_take_field_consumes_once() {
1882        let edges_owned: Vec<Edge> = vec![Edge {
1883            src: panproto_gat::Name::from("p"),
1884            tgt: panproto_gat::Name::from("c"),
1885            kind: panproto_gat::Name::from("name"),
1886            name: None,
1887        }];
1888        let edges: Vec<&Edge> = edges_owned.iter().collect();
1889        let mut cursor = ChildCursor::new(&edges);
1890        let first = cursor.take_field("name");
1891        let second = cursor.take_field("name");
1892        assert!(first.is_some(), "first take returns the edge");
1893        assert!(
1894            second.is_none(),
1895            "second take returns None (already consumed)"
1896        );
1897    }
1898
1899    #[test]
1900    fn child_cursor_take_matching_predicate() {
1901        let edges_owned: Vec<Edge> = vec![
1902            Edge {
1903                src: "p".into(),
1904                tgt: "c1".into(),
1905                kind: "child_of".into(),
1906                name: None,
1907            },
1908            Edge {
1909                src: "p".into(),
1910                tgt: "c2".into(),
1911                kind: "key".into(),
1912                name: None,
1913            },
1914        ];
1915        let edges: Vec<&Edge> = edges_owned.iter().collect();
1916        let mut cursor = ChildCursor::new(&edges);
1917        assert!(cursor.has_matching(|e| e.kind.as_ref() == "key"));
1918        let taken = cursor.take_matching(|e| e.kind.as_ref() == "key");
1919        assert!(taken.is_some());
1920        assert!(
1921            !cursor.has_matching(|e| e.kind.as_ref() == "key"),
1922            "consumed edge no longer matches"
1923        );
1924        assert!(
1925            cursor.has_matching(|e| e.kind.as_ref() == "child_of"),
1926            "the other edge is still available"
1927        );
1928    }
1929
1930    #[test]
1931    fn kind_satisfies_symbol_direct_match() {
1932        let bytes = br#"{
1933            "name": "tiny",
1934            "rules": {
1935                "x": {"type": "STRING", "value": "x"}
1936            }
1937        }"#;
1938        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
1939        assert!(kind_satisfies_symbol(&g, Some("x"), "x"));
1940        assert!(!kind_satisfies_symbol(&g, Some("y"), "x"));
1941        assert!(!kind_satisfies_symbol(&g, None, "x"));
1942    }
1943
1944    #[test]
1945    fn kind_satisfies_symbol_through_hidden_rule() {
1946        let bytes = br#"{
1947            "name": "tiny",
1948            "rules": {
1949                "_value": {
1950                    "type": "CHOICE",
1951                    "members": [
1952                        {"type": "SYMBOL", "name": "object"},
1953                        {"type": "SYMBOL", "name": "number"}
1954                    ]
1955                },
1956                "object": {"type": "STRING", "value": "{}"},
1957                "number": {"type": "PATTERN", "value": "[0-9]+"}
1958            }
1959        }"#;
1960        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
1961        assert!(
1962            kind_satisfies_symbol(&g, Some("number"), "_value"),
1963            "number is reachable from _value via CHOICE"
1964        );
1965        assert!(
1966            kind_satisfies_symbol(&g, Some("object"), "_value"),
1967            "object is reachable from _value via CHOICE"
1968        );
1969        assert!(
1970            !kind_satisfies_symbol(&g, Some("string"), "_value"),
1971            "string is NOT among the alternatives"
1972        );
1973    }
1974
1975    #[test]
1976    fn first_symbol_skips_string_terminals() {
1977        let prod: Production = serde_json::from_str(
1978            r#"{
1979                "type": "SEQ",
1980                "members": [
1981                    {"type": "STRING", "value": "{"},
1982                    {"type": "SYMBOL", "name": "body"},
1983                    {"type": "STRING", "value": "}"}
1984                ]
1985            }"#,
1986        )
1987        .expect("valid SEQ");
1988        assert_eq!(first_symbol(&prod), Some("body"));
1989    }
1990
1991    #[test]
1992    fn placeholder_for_pattern_routes_by_regex_class() {
1993        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
1994        assert_eq!(placeholder_for_pattern("[a-zA-Z_]\\w*"), "_x");
1995        assert_eq!(placeholder_for_pattern("\"[^\"]*\""), "\"\"");
1996        assert_eq!(placeholder_for_pattern("\\d+\\.\\d+"), "0");
1997    }
1998
1999    #[test]
2000    fn format_policy_default_breaks_after_semicolon() {
2001        let policy = FormatPolicy::default();
2002        assert!(policy.line_break_after.iter().any(|t| t == ";"));
2003        assert!(policy.indent_open.iter().any(|t| t == "{"));
2004        assert!(policy.indent_close.iter().any(|t| t == "}"));
2005        assert_eq!(policy.indent_width, 2);
2006    }
2007
2008    #[test]
2009    fn placeholder_decodes_literal_pattern_separators() {
2010        // PATTERN regexes that match a single literal byte sequence
2011        // (newline, semicolon, comma) emit the bytes verbatim instead
2012        // of falling through to the `_` catch-all.
2013        assert_eq!(placeholder_for_pattern("\\n"), "\n");
2014        assert_eq!(placeholder_for_pattern("\\r\\n"), "\r\n");
2015        assert_eq!(placeholder_for_pattern(";"), ";");
2016        // Patterns with character classes / alternation still route
2017        // through the heuristic.
2018        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
2019        assert_eq!(placeholder_for_pattern("a|b"), "_");
2020    }
2021
2022    #[test]
2023    fn supertypes_decode_from_grammar_json_strings() {
2024        // Tree-sitter older grammars list supertypes as bare strings.
2025        let bytes = br#"{
2026            "name": "tiny",
2027            "supertypes": ["expression"],
2028            "rules": {
2029                "expression": {
2030                    "type": "CHOICE",
2031                    "members": [
2032                        {"type": "SYMBOL", "name": "binary_expression"},
2033                        {"type": "SYMBOL", "name": "identifier"}
2034                    ]
2035                },
2036                "binary_expression": {"type": "STRING", "value": "x"},
2037                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2038            }
2039        }"#;
2040        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2041        assert!(g.supertypes.contains("expression"));
2042        // identifier matches the supertype `expression`.
2043        assert!(kind_satisfies_symbol(&g, Some("identifier"), "expression"));
2044        // unrelated kinds do not.
2045        assert!(!kind_satisfies_symbol(&g, Some("string"), "expression"));
2046    }
2047
2048    #[test]
2049    fn supertypes_decode_from_grammar_json_objects() {
2050        // Recent grammars list supertypes as `{type: SYMBOL, name: ...}`
2051        // entries instead of bare strings.
2052        let bytes = br#"{
2053            "name": "tiny",
2054            "supertypes": [{"type": "SYMBOL", "name": "stmt"}],
2055            "rules": {
2056                "stmt": {
2057                    "type": "CHOICE",
2058                    "members": [
2059                        {"type": "SYMBOL", "name": "while_stmt"},
2060                        {"type": "SYMBOL", "name": "if_stmt"}
2061                    ]
2062                },
2063                "while_stmt": {"type": "STRING", "value": "while"},
2064                "if_stmt": {"type": "STRING", "value": "if"}
2065            }
2066        }"#;
2067        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2068        assert!(g.supertypes.contains("stmt"));
2069        assert!(kind_satisfies_symbol(&g, Some("while_stmt"), "stmt"));
2070    }
2071
2072    #[test]
2073    fn alias_value_matches_kind() {
2074        // A named ALIAS rewrites the parser-visible kind to `value`;
2075        // `kind_satisfies_symbol` should accept that rewritten kind
2076        // when looking up the original SYMBOL.
2077        let bytes = br#"{
2078            "name": "tiny",
2079            "rules": {
2080                "_package_identifier": {
2081                    "type": "ALIAS",
2082                    "named": true,
2083                    "value": "package_identifier",
2084                    "content": {"type": "SYMBOL", "name": "identifier"}
2085                },
2086                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2087            }
2088        }"#;
2089        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2090        assert!(kind_satisfies_symbol(
2091            &g,
2092            Some("package_identifier"),
2093            "_package_identifier"
2094        ));
2095    }
2096
2097    #[test]
2098    fn referenced_symbols_walks_nested_seq() {
2099        let prod: Production = serde_json::from_str(
2100            r#"{
2101                "type": "SEQ",
2102                "members": [
2103                    {"type": "CHOICE", "members": [
2104                        {"type": "SYMBOL", "name": "attribute_item"},
2105                        {"type": "BLANK"}
2106                    ]},
2107                    {"type": "SYMBOL", "name": "parameter"},
2108                    {"type": "REPEAT", "content": {
2109                        "type": "SEQ",
2110                        "members": [
2111                            {"type": "STRING", "value": ","},
2112                            {"type": "SYMBOL", "name": "parameter"}
2113                        ]
2114                    }}
2115                ]
2116            }"#,
2117        )
2118        .expect("seq");
2119        let symbols = referenced_symbols(&prod);
2120        assert!(symbols.contains(&"attribute_item"));
2121        assert!(symbols.contains(&"parameter"));
2122    }
2123
2124    #[test]
2125    fn literal_strings_collects_choice_members() {
2126        let prod: Production = serde_json::from_str(
2127            r#"{
2128                "type": "CHOICE",
2129                "members": [
2130                    {"type": "STRING", "value": "+"},
2131                    {"type": "STRING", "value": "-"},
2132                    {"type": "STRING", "value": "*"}
2133                ]
2134            }"#,
2135        )
2136        .expect("choice");
2137        let strings = literal_strings(&prod);
2138        assert_eq!(strings, vec!["+", "-", "*"]);
2139    }
2140
2141    /// The ocaml and javascript grammars (tree-sitter ≥ 0.25) emit a
2142    /// `RESERVED` rule kind that an earlier deserialiser rejected
2143    /// with `unknown variant "RESERVED"`. Verify both that the bare
2144    /// variant deserialises and that a `RESERVED`-wrapped grammar is
2145    /// loadable end-to-end via [`Grammar::from_bytes`].
2146    #[test]
2147    fn reserved_variant_deserialises() {
2148        let prod: Production = serde_json::from_str(
2149            r#"{
2150                "type": "RESERVED",
2151                "content": {"type": "SYMBOL", "name": "_lowercase_identifier"},
2152                "context_name": "attribute_id"
2153            }"#,
2154        )
2155        .expect("RESERVED parses");
2156        match prod {
2157            Production::Reserved { content, .. } => match *content {
2158                Production::Symbol { name } => assert_eq!(name, "_lowercase_identifier"),
2159                other => panic!("expected inner SYMBOL, got {other:?}"),
2160            },
2161            other => panic!("expected RESERVED, got {other:?}"),
2162        }
2163    }
2164
2165    #[test]
2166    fn reserved_grammar_loads_end_to_end() {
2167        let bytes = br#"{
2168            "name": "tiny_reserved",
2169            "rules": {
2170                "program": {
2171                    "type": "RESERVED",
2172                    "content": {"type": "SYMBOL", "name": "ident"},
2173                    "context_name": "keywords"
2174                },
2175                "ident": {"type": "PATTERN", "value": "[a-z]+"}
2176            }
2177        }"#;
2178        let g = Grammar::from_bytes("tiny_reserved", bytes).expect("RESERVED-using grammar loads");
2179        assert!(g.rules.contains_key("program"));
2180    }
2181
2182    #[test]
2183    fn reserved_walker_helpers_recurse_into_content() {
2184        // The walker's helpers (first_symbol, has_field_in,
2185        // referenced_symbols, ...) all need to descend through
2186        // RESERVED into its content. If they bail at RESERVED, the
2187        // `pick_choice_with_cursor` heuristic ranks the alt below
2188        // alts that DO recurse, which produces wrong emit output
2189        // even when the deserialiser doesn't crash.
2190        let prod: Production = serde_json::from_str(
2191            r#"{
2192                "type": "RESERVED",
2193                "content": {
2194                    "type": "FIELD",
2195                    "name": "lhs",
2196                    "content": {"type": "SYMBOL", "name": "expr"}
2197                },
2198                "context_name": "ctx"
2199            }"#,
2200        )
2201        .expect("nested RESERVED parses");
2202        assert_eq!(first_symbol(&prod), Some("expr"));
2203        assert!(has_field_in(&prod, &["lhs"]));
2204        let symbols = referenced_symbols(&prod);
2205        assert!(symbols.contains(&"expr"));
2206    }
2207}