panproto_parse/
emit_pretty.rs

1#![allow(
2    clippy::module_name_repetitions,
3    clippy::too_many_lines,
4    clippy::too_many_arguments,
5    clippy::map_unwrap_or,
6    clippy::option_if_let_else,
7    clippy::elidable_lifetime_names,
8    clippy::items_after_statements,
9    clippy::needless_pass_by_value,
10    clippy::single_match_else,
11    clippy::manual_let_else,
12    clippy::match_same_arms,
13    clippy::missing_const_for_fn,
14    clippy::single_char_pattern,
15    clippy::naive_bytecount,
16    clippy::expect_used,
17    clippy::redundant_pub_crate,
18    clippy::used_underscore_binding,
19    clippy::redundant_field_names,
20    clippy::struct_field_names,
21    clippy::redundant_else,
22    clippy::similar_names
23)]
24
25//! De-novo source emission from a by-construction schema.
26//!
27//! [`AstParser::emit`] reconstructs source from byte-position fragments
28//! that the parser stored on the schema during `parse`. That works for
29//! edit pipelines (`parse → transform → emit`) but fails for schemas
30//! built by hand (`SchemaBuilder` with no parse history): they carry
31//! no `start-byte`, no `interstitial-N`, no `literal-value`, and the
32//! reconstructor returns `Err(EmitFailed { reason: "schema has no
33//! text fragments" })`.
34//!
35//! This module renders such schemas to source bytes by walking
36//! tree-sitter's `grammar.json` production rules. For each schema
37//! vertex of kind `K`, the walker looks up `K`'s production in the
38//! grammar and emits its body in order:
39//!
40//! - `STRING` nodes contribute literal token bytes directly.
41//! - `SYMBOL` and `FIELD` nodes recurse into the schema's children,
42//!   matching by edge kind (which is the tree-sitter field name).
43//! - `SEQ` emits its members in order.
44//! - `CHOICE` picks the alternative whose head `SYMBOL` matches an
45//!   actual child kind, or whose terminals appear in the rendered
46//!   prefix; falls back to the first non-`BLANK` alternative when no
47//!   alternative matches.
48//! - `REPEAT` and `REPEAT1` emit their content once per matching
49//!   child edge in declared order.
50//! - `OPTIONAL` emits its content iff a corresponding child edge or
51//!   constraint is populated.
52//! - `PATTERN` is a regex placeholder for variable-text terminals
53//!   (identifiers, numbers, quoted strings). The walker emits a
54//!   `literal-value` constraint when present and otherwise falls
55//!   back to a placeholder derived from the regex shape.
56//! - `BLANK`, `TOKEN`, `IMMEDIATE_TOKEN`, `ALIAS`, `PREC*` are
57//!   handled transparently (the inner content is emitted; the
58//!   wrapper is dropped).
59//!
60//! Whitespace and indentation come from a `FormatPolicy` applied
61//! during emission. The default policy inserts a single space between
62//! adjacent tokens, a newline after `;` / `}` / `{`, and tracks an
63//! indent counter on `{` / `}` boundaries.
64//!
65//! Output is *syntactically valid* for any grammar that ships
66//! `grammar.json`. Idiomatic formatting (rustfmt-style spacing rules,
67//! per-language conventions) is a polish layer that lives outside
68//! this module.
69
70use std::collections::BTreeMap;
71
72use panproto_schema::{Edge, Schema};
73use serde::Deserialize;
74
75use crate::error::ParseError;
76
77// ═══════════════════════════════════════════════════════════════════
78// Grammar JSON model
79// ═══════════════════════════════════════════════════════════════════
80
81/// A single tree-sitter production rule.
82///
83/// Mirrors the shape emitted by `tree-sitter generate`: every node has
84/// a `type` discriminator that selects a structural variant. The
85/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
86/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
87/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
88/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
89#[derive(Debug, Clone, Deserialize)]
90#[serde(tag = "type")]
91#[non_exhaustive]
92pub enum Production {
93    /// Concatenation of productions.
94    #[serde(rename = "SEQ")]
95    Seq {
96        /// Ordered members; each is emitted in turn.
97        members: Vec<Self>,
98    },
99    /// Alternation between productions.
100    #[serde(rename = "CHOICE")]
101    Choice {
102        /// Alternatives; the walker picks one based on the schema's
103        /// children and constraints.
104        members: Vec<Self>,
105    },
106    /// Zero-or-more repetition.
107    #[serde(rename = "REPEAT")]
108    Repeat {
109        /// The repeated body.
110        content: Box<Self>,
111    },
112    /// One-or-more repetition.
113    #[serde(rename = "REPEAT1")]
114    Repeat1 {
115        /// The repeated body.
116        content: Box<Self>,
117    },
118    /// Optional inclusion (zero or one).
119    ///
120    /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
121    /// BLANK }`, but recent generator versions also emit explicit
122    /// `OPTIONAL` nodes; both shapes are accepted.
123    #[serde(rename = "OPTIONAL")]
124    Optional {
125        /// The optional body.
126        content: Box<Self>,
127    },
128    /// Reference to another rule by name.
129    #[serde(rename = "SYMBOL")]
130    Symbol {
131        /// Name of the referenced rule (matches a vertex kind on the
132        /// schema side).
133        name: String,
134    },
135    /// Literal token bytes.
136    #[serde(rename = "STRING")]
137    String {
138        /// The literal token. Emitted verbatim.
139        value: String,
140    },
141    /// Regex-matched terminal.
142    ///
143    /// At parse time this matches arbitrary bytes; at emit time the
144    /// walker substitutes a `literal-value` constraint when present
145    /// and falls back to a placeholder otherwise.
146    #[serde(rename = "PATTERN")]
147    Pattern {
148        /// The original regex.
149        value: String,
150    },
151    /// The empty production. Emits nothing.
152    #[serde(rename = "BLANK")]
153    Blank,
154    /// Named field over a content production.
155    ///
156    /// The field `name` matches an edge kind on the schema side; the
157    /// walker resolves the corresponding child vertex and recurses
158    /// into `content` with that child as context.
159    #[serde(rename = "FIELD")]
160    Field {
161        /// Field name (matches edge kind).
162        name: String,
163        /// The contents of the field.
164        content: Box<Self>,
165    },
166    /// An aliased production.
167    ///
168    /// `value` records the parser-visible kind; the walker emits
169    /// `content` and ignores the alias rename.
170    #[serde(rename = "ALIAS")]
171    Alias {
172        /// The aliased content.
173        content: Box<Self>,
174        /// Whether the alias is a named node.
175        #[serde(default)]
176        named: bool,
177        /// The alias's surface name.
178        #[serde(default)]
179        value: String,
180    },
181    /// A token wrapper.
182    ///
183    /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
184    /// lexical token; the walker emits the inner content unchanged.
185    #[serde(rename = "TOKEN")]
186    Token {
187        /// The wrapped content.
188        content: Box<Self>,
189    },
190    /// An immediate-token wrapper (no preceding whitespace).
191    ///
192    /// Treated like [`Production::Token`] for emit purposes.
193    #[serde(rename = "IMMEDIATE_TOKEN")]
194    ImmediateToken {
195        /// The wrapped content.
196        content: Box<Self>,
197    },
198    /// Precedence wrapper.
199    #[serde(rename = "PREC")]
200    Prec {
201        /// Precedence value (numeric or string). Ignored at emit time.
202        #[allow(dead_code)]
203        value: serde_json::Value,
204        /// The wrapped content.
205        content: Box<Self>,
206    },
207    /// Left-associative precedence wrapper.
208    #[serde(rename = "PREC_LEFT")]
209    PrecLeft {
210        /// Precedence value. Ignored at emit time.
211        #[allow(dead_code)]
212        value: serde_json::Value,
213        /// The wrapped content.
214        content: Box<Self>,
215    },
216    /// Right-associative precedence wrapper.
217    #[serde(rename = "PREC_RIGHT")]
218    PrecRight {
219        /// Precedence value. Ignored at emit time.
220        #[allow(dead_code)]
221        value: serde_json::Value,
222        /// The wrapped content.
223        content: Box<Self>,
224    },
225    /// Dynamic precedence wrapper.
226    #[serde(rename = "PREC_DYNAMIC")]
227    PrecDynamic {
228        /// Precedence value. Ignored at emit time.
229        #[allow(dead_code)]
230        value: serde_json::Value,
231        /// The wrapped content.
232        content: Box<Self>,
233    },
234    /// Reserved-word wrapper (tree-sitter ≥ 0.25).
235    ///
236    /// Tree-sitter's `RESERVED` rule marks an inner production as a
237    /// reserved-word context: the parser excludes the listed identifiers
238    /// from being treated as the inner symbol. The `context_name`
239    /// metadata names the reserved-word set; the emitter does not need
240    /// it (we are walking schema → bytes, not enforcing reserved-word
241    /// constraints), so we emit the inner content unchanged, the same
242    /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
243    #[serde(rename = "RESERVED")]
244    Reserved {
245        /// The wrapped content.
246        content: Box<Self>,
247        /// Name of the reserved-word context. Ignored at emit time.
248        #[allow(dead_code)]
249        #[serde(default)]
250        context_name: String,
251    },
252}
253
254/// A grammar's production-rule table, deserialized from `grammar.json`.
255///
256/// Only the fields the emitter consumes are decoded; precedences,
257/// conflicts, externals, and other parser-only metadata are ignored.
258#[derive(Debug, Clone, Deserialize)]
259pub struct Grammar {
260    /// Grammar name (e.g. `"rust"`, `"typescript"`).
261    #[allow(dead_code)]
262    pub name: String,
263    /// Map from rule name (a vertex kind on the schema side) to
264    /// production. Entries are kept in lexical order so iteration
265    /// is deterministic.
266    pub rules: BTreeMap<String, Production>,
267    /// Supertypes declared in the grammar's `supertypes` field. A
268    /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
269    /// references; tree-sitter parsers report a node's kind as one
270    /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
271    /// than the supertype name (`parameter`), so the emitter needs to
272    /// know that a child kind in a subtype set should match the
273    /// supertype name when a SYMBOL references it.
274    #[serde(default, deserialize_with = "deserialize_supertypes")]
275    pub supertypes: std::collections::HashSet<String>,
276    /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
277    /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
278    /// reference on the schema side.
279    ///
280    /// Built once at [`Grammar::from_bytes`] time by walking each
281    /// hidden rule (`_`-prefixed), declared supertype, and named
282    /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
283    /// recording the closure. This replaces the prior heuristic
284    /// `kind_satisfies_symbol` that walked the rule body on every
285    /// query: lookups are now O(1) and the relation is exactly the
286    /// transitive closure of "is reachable via hidden / supertype /
287    /// alias dispatch", with no over-expansion through non-hidden
288    /// non-supertype rule references.
289    #[serde(skip)]
290    pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
291}
292
293fn deserialize_supertypes<'de, D>(
294    deserializer: D,
295) -> Result<std::collections::HashSet<String>, D::Error>
296where
297    D: serde::Deserializer<'de>,
298{
299    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
300    let mut out = std::collections::HashSet::new();
301    for entry in entries {
302        match entry {
303            serde_json::Value::String(s) => {
304                out.insert(s);
305            }
306            serde_json::Value::Object(map) => {
307                if let Some(serde_json::Value::String(name)) = map.get("name") {
308                    out.insert(name.clone());
309                }
310            }
311            _ => {}
312        }
313    }
314    Ok(out)
315}
316
317impl Grammar {
318    /// Parse a grammar's `grammar.json` bytes.
319    ///
320    /// Builds the subtyping closure as part of construction so every
321    /// downstream lookup is O(1). The closure is the least relation
322    /// containing `(K, K)` for every rule key `K` and closed under:
323    ///
324    /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
325    ///   reach SYMBOL `K`, then `K ⊑ S`.
326    /// - supertype expansion: if `S` is in the grammar's supertypes
327    ///   block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
328    /// - alias renaming: if a rule body contains
329    ///   `ALIAS { content: SYMBOL R, value: A, named: true }` where
330    ///   `R` reaches kind `K` (or `K = R` when no further hop), then
331    ///   `A ⊑ R` and `K ⊑ A`.
332    ///
333    /// # Errors
334    ///
335    /// Returns [`ParseError::EmitFailed`] when the bytes are not a
336    /// valid `grammar.json` document.
337    pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
338        let mut grammar: Self =
339            serde_json::from_slice(bytes).map_err(|e| ParseError::EmitFailed {
340                protocol: protocol.to_owned(),
341                reason: format!("grammar.json deserialization failed: {e}"),
342            })?;
343        grammar.subtypes = compute_subtype_closure(&grammar);
344        Ok(grammar)
345    }
346}
347
348/// Compute the subtyping relation as a forward-indexed map from a
349/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
350fn compute_subtype_closure(
351    grammar: &Grammar,
352) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
353    use std::collections::{HashMap, HashSet};
354    // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
355    // recorded whenever Y is reached by walking the grammar's
356    // ALIAS / hidden-rule / supertype dispatch from a position where
357    // K is the actual vertex kind.
358    let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
359    for name in grammar.rules.keys() {
360        subtypes
361            .entry(name.clone())
362            .or_default()
363            .insert(name.clone());
364    }
365
366    // First pass: collect the immediate "satisfies" edges from each
367    // expandable rule (hidden, supertype) to the kinds reachable by
368    // walking its body, plus alias edges.
369    fn walk<'g>(
370        grammar: &'g Grammar,
371        production: &'g Production,
372        visited: &mut HashSet<&'g str>,
373        out: &mut HashSet<String>,
374    ) {
375        match production {
376            Production::Symbol { name } => {
377                // Direct subtype.
378                out.insert(name.clone());
379                // Continue expansion through hidden / supertype rules
380                // so the closure traverses pass-through dispatch.
381                let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
382                if expand && visited.insert(name.as_str()) {
383                    if let Some(rule) = grammar.rules.get(name) {
384                        walk(grammar, rule, visited, out);
385                    }
386                }
387            }
388            Production::Choice { members } | Production::Seq { members } => {
389                for m in members {
390                    walk(grammar, m, visited, out);
391                }
392            }
393            Production::Alias {
394                content,
395                named,
396                value,
397            } => {
398                if *named && !value.is_empty() {
399                    out.insert(value.clone());
400                }
401                walk(grammar, content, visited, out);
402            }
403            Production::Repeat { content }
404            | Production::Repeat1 { content }
405            | Production::Optional { content }
406            | Production::Field { content, .. }
407            | Production::Token { content }
408            | Production::ImmediateToken { content }
409            | Production::Prec { content, .. }
410            | Production::PrecLeft { content, .. }
411            | Production::PrecRight { content, .. }
412            | Production::PrecDynamic { content, .. }
413            | Production::Reserved { content, .. } => {
414                walk(grammar, content, visited, out);
415            }
416            _ => {}
417        }
418    }
419
420    for (name, rule) in &grammar.rules {
421        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
422        if !expand {
423            continue;
424        }
425        let mut visited: HashSet<&str> = HashSet::new();
426        visited.insert(name.as_str());
427        let mut reachable: HashSet<String> = HashSet::new();
428        walk(grammar, rule, &mut visited, &mut reachable);
429        for kind in &reachable {
430            subtypes
431                .entry(kind.clone())
432                .or_default()
433                .insert(name.clone());
434        }
435    }
436
437    // Aliases: scan every rule body for ALIAS { content, value }
438    // declarations. The kinds reachable from `content` satisfy
439    // `value`, AND (by construction) `value` satisfies the
440    // surrounding rule. Walking the ENTIRE grammar once captures
441    // every alias site, irrespective of which rule introduces it.
442    fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
443        match production {
444            Production::Alias {
445                content,
446                named,
447                value,
448            } => {
449                if *named && !value.is_empty() {
450                    out.push((value.clone(), content.as_ref()));
451                }
452                collect_aliases(content, out);
453            }
454            Production::Choice { members } | Production::Seq { members } => {
455                for m in members {
456                    collect_aliases(m, out);
457                }
458            }
459            Production::Repeat { content }
460            | Production::Repeat1 { content }
461            | Production::Optional { content }
462            | Production::Field { content, .. }
463            | Production::Token { content }
464            | Production::ImmediateToken { content }
465            | Production::Prec { content, .. }
466            | Production::PrecLeft { content, .. }
467            | Production::PrecRight { content, .. }
468            | Production::PrecDynamic { content, .. }
469            | Production::Reserved { content, .. } => {
470                collect_aliases(content, out);
471            }
472            _ => {}
473        }
474    }
475    let mut aliases: Vec<(String, &Production)> = Vec::new();
476    for rule in grammar.rules.values() {
477        collect_aliases(rule, &mut aliases);
478    }
479    for (alias_value, content) in aliases {
480        let mut visited: HashSet<&str> = HashSet::new();
481        let mut reachable: HashSet<String> = HashSet::new();
482        walk(grammar, content, &mut visited, &mut reachable);
483        // Aliased value satisfies itself and is satisfied by every
484        // kind its content can reach.
485        subtypes
486            .entry(alias_value.clone())
487            .or_default()
488            .insert(alias_value.clone());
489        for kind in reachable {
490            subtypes
491                .entry(kind)
492                .or_default()
493                .insert(alias_value.clone());
494        }
495    }
496
497    // Transitive close: `K ⊑ A` and `A ⊑ B` implies `K ⊑ B`. Iterate
498    // a few rounds; the relation is small so a quick fixed-point
499    // suffices in practice.
500    for _ in 0..8 {
501        let snapshot = subtypes.clone();
502        let mut changed = false;
503        for (kind, supers) in &snapshot {
504            let extra: HashSet<String> = supers
505                .iter()
506                .flat_map(|s| snapshot.get(s).cloned().unwrap_or_default())
507                .collect();
508            let entry = subtypes.entry(kind.clone()).or_default();
509            for s in extra {
510                if entry.insert(s) {
511                    changed = true;
512                }
513            }
514        }
515        if !changed {
516            break;
517        }
518    }
519
520    subtypes
521}
522
523// ═══════════════════════════════════════════════════════════════════
524// Format policy
525// ═══════════════════════════════════════════════════════════════════
526
527/// Whitespace and indentation policy applied during emission.
528///
529/// The default policy inserts a single space between adjacent tokens,
530/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
531/// boundaries. Per-language overrides (idiomatic indent width,
532/// trailing-comma rules, blank-line conventions) can ride alongside
533/// this struct in a follow-up branch; today's defaults aim only for
534/// syntactic validity.
535#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
536pub struct FormatPolicy {
537    /// Number of spaces per indent level.
538    pub indent_width: usize,
539    /// Separator inserted between adjacent terminals that the lexer
540    /// would otherwise glue together (word ↔ word, operator ↔ operator).
541    /// Default is a single space.
542    pub separator: String,
543    /// Newline byte sequence emitted after `line_break_after` tokens
544    /// and at end-of-output. Default is `"\n"`.
545    pub newline: String,
546    /// Tokens after which the walker breaks to a new line.
547    pub line_break_after: Vec<String>,
548    /// Tokens that increase indent on emission.
549    pub indent_open: Vec<String>,
550    /// Tokens that decrease indent on emission.
551    pub indent_close: Vec<String>,
552}
553
554impl Default for FormatPolicy {
555    fn default() -> Self {
556        Self {
557            indent_width: 2,
558            separator: " ".to_owned(),
559            newline: "\n".to_owned(),
560            line_break_after: vec![";".into(), "{".into(), "}".into()],
561            indent_open: vec!["{".into()],
562            indent_close: vec!["}".into()],
563        }
564    }
565}
566
567// ═══════════════════════════════════════════════════════════════════
568// Emitter
569// ═══════════════════════════════════════════════════════════════════
570
571/// Emit a by-construction schema to source bytes.
572///
573/// `protocol` is the grammar / language name (used in error messages
574/// and to label the entry point).
575///
576/// The walker treats `schema.entries` as the ordered list of root
577/// vertices, falling back to a deterministic by-id ordering when
578/// `entries` is empty. Each root is emitted using the production
579/// associated with its kind in `grammar.rules`.
580///
581/// # Errors
582///
583/// Returns [`ParseError::EmitFailed`] when:
584///
585/// - the schema has no vertices
586/// - a root vertex's kind is not a grammar rule
587/// - a `SYMBOL` reference points at a kind with no rule and no schema
588///   child to resolve it to
589/// - a required `FIELD` has no corresponding edge in the schema
590pub fn emit_pretty(
591    protocol: &str,
592    schema: &Schema,
593    grammar: &Grammar,
594    policy: &FormatPolicy,
595) -> Result<Vec<u8>, ParseError> {
596    let roots = collect_roots(schema);
597    if roots.is_empty() {
598        return Err(ParseError::EmitFailed {
599            protocol: protocol.to_owned(),
600            reason: "schema has no entry vertices".to_owned(),
601        });
602    }
603
604    let mut out = Output::new(policy);
605    for (i, root) in roots.iter().enumerate() {
606        if i > 0 {
607            out.newline();
608        }
609        emit_vertex(protocol, schema, grammar, root, &mut out)?;
610    }
611    Ok(out.finish())
612}
613
614fn collect_roots(schema: &Schema) -> Vec<&panproto_gat::Name> {
615    if !schema.entries.is_empty() {
616        return schema
617            .entries
618            .iter()
619            .filter(|name| schema.vertices.contains_key(*name))
620            .collect();
621    }
622
623    // Fallback: every vertex that is not the target of any structural edge
624    // (sorted by id for determinism).
625    let mut targets: std::collections::HashSet<&panproto_gat::Name> =
626        std::collections::HashSet::new();
627    for edge in schema.edges.keys() {
628        targets.insert(&edge.tgt);
629    }
630    let mut roots: Vec<&panproto_gat::Name> = schema
631        .vertices
632        .keys()
633        .filter(|name| !targets.contains(name))
634        .collect();
635    roots.sort();
636    roots
637}
638
639fn emit_vertex(
640    protocol: &str,
641    schema: &Schema,
642    grammar: &Grammar,
643    vertex_id: &panproto_gat::Name,
644    out: &mut Output<'_>,
645) -> Result<(), ParseError> {
646    let vertex = schema
647        .vertices
648        .get(vertex_id)
649        .ok_or_else(|| ParseError::EmitFailed {
650            protocol: protocol.to_owned(),
651            reason: format!("vertex '{vertex_id}' not found"),
652        })?;
653
654    // Leaf shortcut: a vertex carrying a `literal-value` constraint
655    // and no outgoing structural edges is a terminal token. Emit the
656    // captured value directly. This handles identifiers, numeric
657    // literals, and string literals that the parser stored as
658    // `literal-value` even on by-construction schemas.
659    if let Some(literal) = literal_value(schema, vertex_id) {
660        if children_for(schema, vertex_id).is_empty() {
661            out.token(literal);
662            return Ok(());
663        }
664    }
665
666    let kind = vertex.kind.as_ref();
667    let edges = children_for(schema, vertex_id);
668    if let Some(rule) = grammar.rules.get(kind) {
669        let mut cursor = ChildCursor::new(&edges);
670        return emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out);
671    }
672
673    // No rule for this kind. The parser produced it via an ALIAS
674    // (tree-sitter's `alias($.something, $.actual_kind)`) or via an
675    // external scanner (e.g. YAML's `document` root). Fall back to
676    // walking the children directly so the inner content survives;
677    // surrounding tokens — whose only source is the missing rule —
678    // are necessarily absent.
679    for edge in &edges {
680        emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
681    }
682    Ok(())
683}
684
685/// Linear cursor over a vertex's outgoing edges, used to thread
686/// children through a production rule without double-consuming them.
687struct ChildCursor<'a> {
688    edges: &'a [&'a Edge],
689    consumed: Vec<bool>,
690}
691
692impl<'a> ChildCursor<'a> {
693    fn new(edges: &'a [&'a Edge]) -> Self {
694        Self {
695            edges,
696            consumed: vec![false; edges.len()],
697        }
698    }
699
700    /// Take the next unconsumed edge whose kind equals `field_name`.
701    fn take_field(&mut self, field_name: &str) -> Option<&'a Edge> {
702        for (i, edge) in self.edges.iter().enumerate() {
703            if !self.consumed[i] && edge.kind.as_ref() == field_name {
704                self.consumed[i] = true;
705                return Some(edge);
706            }
707        }
708        None
709    }
710
711    /// Whether any unconsumed edge satisfies `predicate`. Used by the
712    /// unit tests; the live emit path went through `has_matching` on
713    /// each alternative until cursor-driven dispatch was rewritten to
714    /// pick the first-unconsumed-edge's kind directly.
715    #[cfg(test)]
716    fn has_matching(&self, predicate: impl Fn(&Edge) -> bool) -> bool {
717        self.edges
718            .iter()
719            .enumerate()
720            .any(|(i, edge)| !self.consumed[i] && predicate(edge))
721    }
722
723    /// Take the next unconsumed edge whose target vertex satisfies
724    /// `predicate`. Returns the edge and the underlying production
725    /// resolution path is the caller's job.
726    fn take_matching(&mut self, predicate: impl Fn(&Edge) -> bool) -> Option<&'a Edge> {
727        for (i, edge) in self.edges.iter().enumerate() {
728            if !self.consumed[i] && predicate(edge) {
729                self.consumed[i] = true;
730                return Some(edge);
731            }
732        }
733        None
734    }
735}
736
737thread_local! {
738    static EMIT_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
739    /// Set of `(vertex_id, rule_name)` pairs that are currently being
740    /// walked by the recursion. A SYMBOL that resolves to a rule
741    /// already on this stack closes a μ-binder cycle: in the
742    /// coinductive reading, the rule walk at any vertex is the least
743    /// fixed point of `body[μ X . body / X]`, which unfolds at most
744    /// once, with the second visit returning the empty sequence (the
745    /// unit of the free token monoid). Examples that trigger this:
746    /// YAML's `stream` ⊃ `_b_blk_*` mutually-recursive chain, Rust's
747    /// `_expression` ⊃ `binary_expression` ⊃ `_expression`.
748    static EMIT_MU_FRAMES: std::cell::RefCell<std::collections::HashSet<(String, String)>> =
749        std::cell::RefCell::new(std::collections::HashSet::new());
750    /// The name of the FIELD whose body the walker is currently inside,
751    /// or `None` at top level. Lets a SYMBOL nested arbitrarily deep
752    /// in the field's content (under SEQ, CHOICE, REPEAT, OPTIONAL)
753    /// consume from the *outer* cursor by edge-kind rather than from
754    /// the child's own cursor by symbol-match. Without this, shapes
755    /// like `field('args', commaSep1($.X))` — which expands to
756    /// `FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X))))` — emit only
757    /// the first matched edge: the FIELD handler consumed one edge,
758    /// the inner REPEAT searched the consumed child's cursor (which
759    /// has no more sibling field edges), and the REPEAT broke after
760    /// one iteration. Setting the context here so the inner SYMBOL
761    /// pulls successive field-named edges from the outer cursor
762    /// recovers every matched edge across arbitrary nesting.
763    static EMIT_FIELD_CONTEXT: std::cell::RefCell<Option<String>> =
764        const { std::cell::RefCell::new(None) };
765}
766
767/// RAII guard that restores the prior `EMIT_FIELD_CONTEXT` value on drop.
768struct FieldContextGuard(Option<String>);
769
770impl Drop for FieldContextGuard {
771    fn drop(&mut self) {
772        EMIT_FIELD_CONTEXT.with(|f| *f.borrow_mut() = self.0.take());
773    }
774}
775
776fn push_field_context(name: &str) -> FieldContextGuard {
777    let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().replace(name.to_owned()));
778    FieldContextGuard(prev)
779}
780
781/// Clear the field context for the duration of a child-context walk.
782/// The child's own production has its own FIELDs that set their own
783/// context; the outer field hint must not leak into them.
784fn clear_field_context() -> FieldContextGuard {
785    let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().take());
786    FieldContextGuard(prev)
787}
788
789fn current_field_context() -> Option<String> {
790    EMIT_FIELD_CONTEXT.with(|f| f.borrow().clone())
791}
792
793/// Walk a rule at a vertex inside a μ-binder. The wrapping frame is
794/// pushed before recursion and popped after, so any SYMBOL inside
795/// `rule` that re-enters the same `(vertex_id, rule_name)` pair
796/// returns the empty sequence (μ X . body unfolds once).
797fn walk_in_mu_frame(
798    protocol: &str,
799    schema: &Schema,
800    grammar: &Grammar,
801    vertex_id: &panproto_gat::Name,
802    rule_name: &str,
803    rule: &Production,
804    cursor: &mut ChildCursor<'_>,
805    out: &mut Output<'_>,
806) -> Result<(), ParseError> {
807    let key = (vertex_id.to_string(), rule_name.to_owned());
808    let inserted = EMIT_MU_FRAMES.with(|frames| frames.borrow_mut().insert(key.clone()));
809    if !inserted {
810        // We are already walking this rule at this vertex deeper in
811        // the call stack. The coinductive μ-fixed-point reading
812        // returns the empty sequence here; the surrounding
813        // production resumes after the SYMBOL.
814        return Ok(());
815    }
816    let result = emit_production(protocol, schema, grammar, vertex_id, rule, cursor, out);
817    EMIT_MU_FRAMES.with(|frames| {
818        frames.borrow_mut().remove(&key);
819    });
820    result
821}
822
823fn emit_production(
824    protocol: &str,
825    schema: &Schema,
826    grammar: &Grammar,
827    vertex_id: &panproto_gat::Name,
828    production: &Production,
829    cursor: &mut ChildCursor<'_>,
830    out: &mut Output<'_>,
831) -> Result<(), ParseError> {
832    let depth = EMIT_DEPTH.with(|d| {
833        let v = d.get() + 1;
834        d.set(v);
835        v
836    });
837    if depth > 500 {
838        EMIT_DEPTH.with(|d| d.set(d.get() - 1));
839        return Err(ParseError::EmitFailed {
840            protocol: protocol.to_owned(),
841            reason: format!(
842                "emit_production recursion >500 (likely a cyclic grammar; \
843                     vertex='{vertex_id}')"
844            ),
845        });
846    }
847    let result = emit_production_inner(
848        protocol, schema, grammar, vertex_id, production, cursor, out,
849    );
850    EMIT_DEPTH.with(|d| d.set(d.get() - 1));
851    result
852}
853
854fn emit_production_inner(
855    protocol: &str,
856    schema: &Schema,
857    grammar: &Grammar,
858    vertex_id: &panproto_gat::Name,
859    production: &Production,
860    cursor: &mut ChildCursor<'_>,
861    out: &mut Output<'_>,
862) -> Result<(), ParseError> {
863    match production {
864        Production::String { value } => {
865            out.token(value);
866            Ok(())
867        }
868        Production::Pattern { value } => {
869            if let Some(literal) = literal_value(schema, vertex_id) {
870                out.token(literal);
871            } else if is_newline_like_pattern(value) {
872                // Patterns like `\r?\n`, `\n`, `\r\n` are the structural
873                // newline tokens grammars use to separate top-level
874                // statements (csound's `_new_line`, ABC's line-end, etc.).
875                // Emitting them through the placeholder fallback rendered
876                // the bare `_` sentinel between siblings; route them to
877                // the layout pass's line-break instead so the output
878                // re-parses.
879                out.newline();
880            } else if is_whitespace_only_pattern(value) {
881                // `\s+`, `[ \t]+` and friends are interstitial whitespace
882                // tokens. Emit nothing: the layout pass inserts the
883                // policy separator between adjacent Lits if needed.
884            } else {
885                out.token(&placeholder_for_pattern(value));
886            }
887            Ok(())
888        }
889        Production::Blank => Ok(()),
890        Production::Symbol { name } => {
891            // Inside a FIELD body, a SYMBOL consumes by field-name on
892            // the outer cursor rather than searching by symbol-match.
893            // This covers the simple `FIELD(SYMBOL X)` case as well as
894            // every nesting under FIELD that contains SYMBOLs (SEQ,
895            // CHOICE, REPEAT, OPTIONAL, ALIAS). Without the override,
896            // shapes like `field('args', commaSep1($.X))` consume one
897            // field edge in the FIELD handler and then the REPEAT
898            // inside SEQ searches the consumed child's cursor — where
899            // no sibling field edges sit — and breaks after one
900            // iteration.
901            if let Some(field) = current_field_context() {
902                if let Some(edge) = cursor.take_field(&field) {
903                    return emit_in_child_context(
904                        protocol, schema, grammar, &edge.tgt, production, out,
905                    );
906                }
907                // No matching field-named edge left on the outer
908                // cursor. Surface nothing; the surrounding REPEAT /
909                // OPTIONAL / CHOICE backtracks the literal tokens it
910                // emitted on this iteration when it sees no progress.
911                return Ok(());
912            }
913            if name.starts_with('_') {
914                // Hidden rule: not a vertex kind on the schema side.
915                // Inline-expand the rule body so its children take
916                // edges from the current cursor, instead of trying to
917                // take a single child edge that "satisfies" the
918                // hidden rule and discarding the rest of the body
919                // (which would drop tokens like `=` and the trailing
920                // value SYMBOL inside e.g. TOML's `_inline_pair`).
921                //
922                // Wrapped in a μ-frame so a hidden rule that
923                // references its own kind cyclically (or another
924                // hidden rule that closes the cycle) unfolds once
925                // and then collapses to the empty sequence at the
926                // second visit, rather than blowing the stack.
927                if let Some(rule) = grammar.rules.get(name) {
928                    walk_in_mu_frame(
929                        protocol, schema, grammar, vertex_id, name, rule, cursor, out,
930                    )
931                } else {
932                    // External hidden rule (declared in the
933                    // grammar's `externals` block, scanned by C code,
934                    // not listed in `rules`). Heuristic fallback by
935                    // name:
936                    //
937                    // - `_indent` / `*_indent`: open an indent block.
938                    //   Indent-based grammars (Python, YAML, qvr)
939                    //   declare an `_indent` external scanner before
940                    //   the body of a block-bodied declaration; the
941                    //   emitted output is unparseable without the
942                    //   corresponding indentation jump.
943                    // - `_dedent` / `*_dedent`: close the matching
944                    //   indent block.
945                    // - `_newline` / `*_line_ending` / `*_or_eof`:
946                    //   universally newline-or-empty; emitting a
947                    //   single newline is the right default for
948                    //   grammars like TOML whose `pair` SEQ trails
949                    //   into `_line_ending_or_eof`.
950                    //
951                    // Anything else falls through silently — better
952                    // to drop an unknown external token than to
953                    // invent one that confuses re-parsing.
954                    if name == "_indent" || name.ends_with("_indent") {
955                        out.indent_open();
956                    } else if name == "_dedent" || name.ends_with("_dedent") {
957                        out.indent_close();
958                    } else if name.contains("line_ending")
959                        || name.contains("newline")
960                        || name.ends_with("_or_eof")
961                    {
962                        out.newline();
963                    }
964                    Ok(())
965                }
966            } else if let Some(edge) = take_symbol_match(grammar, schema, cursor, name) {
967                // For supertype / hidden-rule dispatch the child's
968                // own kind names the actual production to walk
969                // (`child.kind` IS the subtype). For ALIAS the
970                // dependent-optic context is carried by the
971                // surrounding `Production::Alias` branch, which calls
972                // `emit_aliased_child` directly; we don't reach here
973                // for that case. So walking `grammar.rules[child.kind]`
974                // via `emit_vertex` is correct: the dependent-optic
975                // path is preserved at every site where it actually
976                // diverges from `child.kind`.
977                emit_vertex(protocol, schema, grammar, &edge.tgt, out)
978            } else if vertex_id_kind(schema, vertex_id) == Some(name.as_str()) {
979                let rule = grammar
980                    .rules
981                    .get(name)
982                    .ok_or_else(|| ParseError::EmitFailed {
983                        protocol: protocol.to_owned(),
984                        reason: format!("no production for SYMBOL '{name}'"),
985                    })?;
986                // Self-reference (`X = ... SYMBOL X ...`): wrap in a
987                // μ-frame so re-entry collapses to the empty sequence.
988                walk_in_mu_frame(
989                    protocol, schema, grammar, vertex_id, name, rule, cursor, out,
990                )
991            } else {
992                // Named rule with no matching child: emit nothing and
993                // let the surrounding CHOICE / OPTIONAL / REPEAT
994                // resolve the absence.
995                Ok(())
996            }
997        }
998        Production::Seq { members } => {
999            for member in members {
1000                emit_production(protocol, schema, grammar, vertex_id, member, cursor, out)?;
1001            }
1002            Ok(())
1003        }
1004        Production::Choice { members } => {
1005            if let Some(matched) =
1006                pick_choice_with_cursor(schema, grammar, vertex_id, cursor, members)
1007            {
1008                emit_production(protocol, schema, grammar, vertex_id, matched, cursor, out)
1009            } else {
1010                Ok(())
1011            }
1012        }
1013        Production::Repeat { content } | Production::Repeat1 { content } => {
1014            let mut emitted_any = false;
1015            loop {
1016                let cursor_snap = cursor.consumed.clone();
1017                let out_snap = out.snapshot();
1018                let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1019                let result =
1020                    emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
1021                let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1022                if result.is_err() || consumed_after == consumed_before {
1023                    cursor.consumed = cursor_snap;
1024                    out.restore(out_snap);
1025                    break;
1026                }
1027                emitted_any = true;
1028            }
1029            if matches!(production, Production::Repeat1 { .. }) && !emitted_any {
1030                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)?;
1031            }
1032            Ok(())
1033        }
1034        Production::Optional { content } => {
1035            let cursor_snap = cursor.consumed.clone();
1036            let out_snap = out.snapshot();
1037            let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1038            let result =
1039                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
1040            // OPTIONAL is a backtracking site: if the inner production
1041            // errored *or* made no progress without leaving a witness
1042            // constraint, restore both cursor and output to their
1043            // pre-attempt state. Mirrors `Repeat`'s loop body.
1044            if result.is_err() {
1045                cursor.consumed = cursor_snap;
1046                out.restore(out_snap);
1047                return result;
1048            }
1049            let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1050            if consumed_after == consumed_before
1051                && !has_relevant_constraint(content, schema, vertex_id)
1052            {
1053                cursor.consumed = cursor_snap;
1054                out.restore(out_snap);
1055            }
1056            Ok(())
1057        }
1058        Production::Field { name, content } => {
1059            // Set the field context for the duration of `content`'s
1060            // walk and emit the content against the *outer* cursor.
1061            // The SYMBOL handler picks up the context and pulls
1062            // successive `take_field(name)` edges as it encounters
1063            // SYMBOLs anywhere under `content` (under SEQ, CHOICE,
1064            // REPEAT, OPTIONAL, ALIAS — arbitrarily nested). This
1065            // subsumes the prior carve-outs for FIELD(REPEAT(...)),
1066            // FIELD(REPEAT1(...)), and the bare FIELD(SYMBOL ...)
1067            // case, and adds coverage for
1068            // `field('xs', commaSep1($.X))` which expands to
1069            // FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X)))) and
1070            // any other shape where REPEAT/REPEAT1 sits inside SEQ /
1071            // CHOICE / OPTIONAL under a FIELD. A FIELD that wraps a
1072            // non-SYMBOL production (e.g. `field('op', '+')` or
1073            // `field('op', CHOICE(STRING ...))`) still works: STRING
1074            // handlers ignore the context and emit literals
1075            // directly, so the operator token survives the round
1076            // trip.
1077            let _guard = push_field_context(name);
1078            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1079        }
1080        Production::Alias {
1081            content,
1082            named,
1083            value,
1084        } => {
1085            // A named ALIAS rewrites the parser-visible kind to
1086            // `value`. If the cursor has an unconsumed child whose
1087            // kind matches that alias name, take it and emit the
1088            // child using the alias's INNER content as the rule
1089            // (e.g. `ALIAS { SYMBOL real_rule, value: "kind_x" }`
1090            // means a `kind_x` vertex on the schema should be walked
1091            // through `real_rule`'s body, not through whatever rule
1092            // happens to be keyed under `kind_x`). This is the
1093            // dependent-optic shape: the rule the emitter walks at a
1094            // child position is determined by the parent's chosen
1095            // alias, not by the child kind alone — without it,
1096            // grammars like YAML that introduce the same kind through
1097            // many ALIAS sites lose the parent context the moment
1098            // emit_vertex is called.
1099            if *named && !value.is_empty() {
1100                if let Some(edge) = cursor.take_matching(|edge| {
1101                    schema
1102                        .vertices
1103                        .get(&edge.tgt)
1104                        .map(|v| v.kind.as_ref() == value.as_str())
1105                        .unwrap_or(false)
1106                }) {
1107                    return emit_aliased_child(protocol, schema, grammar, &edge.tgt, content, out);
1108                }
1109            }
1110            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1111        }
1112        Production::Token { content }
1113        | Production::ImmediateToken { content }
1114        | Production::Prec { content, .. }
1115        | Production::PrecLeft { content, .. }
1116        | Production::PrecRight { content, .. }
1117        | Production::PrecDynamic { content, .. }
1118        | Production::Reserved { content, .. } => {
1119            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1120        }
1121    }
1122}
1123
1124/// Take the next cursor edge whose target vertex's kind matches the
1125/// SYMBOL `name` directly or via inline expansion of a hidden rule.
1126fn take_symbol_match<'a>(
1127    grammar: &Grammar,
1128    schema: &Schema,
1129    cursor: &mut ChildCursor<'a>,
1130    name: &str,
1131) -> Option<&'a Edge> {
1132    cursor.take_matching(|edge| {
1133        let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1134        kind_satisfies_symbol(grammar, target_kind, name)
1135    })
1136}
1137
1138/// Decide whether a schema vertex of kind `target_kind` satisfies a
1139/// SYMBOL `name` reference in the grammar.
1140///
1141/// Operates as an O(1) lookup against the precomputed subtype
1142/// closure built at [`Grammar::from_bytes`]. The semantic content is
1143/// "K satisfies SYMBOL S iff K is reachable from S by walking the
1144/// grammar's hidden, supertype, and named-alias dispatch": this is
1145/// exactly the relation tree-sitter induces on `(parser-visible kind,
1146/// rule-position)` pairs.
1147fn kind_satisfies_symbol(grammar: &Grammar, target_kind: Option<&str>, name: &str) -> bool {
1148    let Some(target) = target_kind else {
1149        return false;
1150    };
1151    if target == name {
1152        return true;
1153    }
1154    grammar
1155        .subtypes
1156        .get(target)
1157        .is_some_and(|set| set.contains(name))
1158}
1159
1160/// Emit a child reached through an ALIAS production using the
1161/// alias's inner content as the rule, not `grammar.rules[child.kind]`.
1162///
1163/// This carries the dependent-optic context across the ALIAS edge:
1164/// at the parent rule's site we know which underlying production the
1165/// alias wraps (typically `SYMBOL real_rule`), and that's the
1166/// production that should drive the emit walk on the child's
1167/// children. Looking up `grammar.rules.get(child.kind)` instead would
1168/// either fail (the renamed kind has no top-level rule, e.g. YAML's
1169/// `block_mapping_pair`) or pick an arbitrary same-kinded rule from
1170/// elsewhere in the grammar.
1171///
1172/// Walk-context invariant. The dependent-optic shape of `emit_pretty`
1173/// says: the production walked at any vertex is determined by the
1174/// path from the root through the grammar, not by the vertex kind in
1175/// isolation. Two dispatch sites realise that invariant:
1176///
1177/// * [`emit_vertex`] looks up `grammar.rules[child.kind]` and walks
1178///   it. Correct for supertype / hidden-rule dispatch: the child's
1179///   kind on the schema IS the subtype tree-sitter selected, so its
1180///   top-level rule is the right production to walk.
1181/// * `emit_aliased_child` threads the parent rule's `Production`
1182///   directly (the inner `content` of `Production::Alias`) and walks
1183///   it on the child's children. Correct for ALIAS dispatch: the
1184///   child's kind on the schema is the alias's `value` (a renamed
1185///   kind that may have no top-level rule), and the production to
1186///   walk is the alias's content body, supplied by the parent.
1187///
1188/// Together these cover every site where the rule-walked-at-child
1189/// diverges from `grammar.rules[child.kind]`; the recursion site for
1190/// plain SYMBOL therefore correctly delegates to `emit_vertex`, and
1191/// we do not need a richer `WalkContext` value passed by reference.
1192/// The grammar dependency is the thread.
1193fn emit_aliased_child(
1194    protocol: &str,
1195    schema: &Schema,
1196    grammar: &Grammar,
1197    child_id: &panproto_gat::Name,
1198    content: &Production,
1199    out: &mut Output<'_>,
1200) -> Result<(), ParseError> {
1201    // Leaf shortcut: if the child has a literal-value and no
1202    // structural children, emit the captured text. Identifiers and
1203    // similar terminals reach here when an ALIAS wraps a SYMBOL that
1204    // resolves to a PATTERN.
1205    if let Some(literal) = literal_value(schema, child_id) {
1206        if children_for(schema, child_id).is_empty() {
1207            out.token(literal);
1208            return Ok(());
1209        }
1210    }
1211
1212    // Resolve `content` to a rule when it's a SYMBOL (the dominant
1213    // shape: `ALIAS { content: SYMBOL real_rule, value: "kind_x" }`).
1214    if let Production::Symbol { name } = content {
1215        if let Some(rule) = grammar.rules.get(name) {
1216            let edges = children_for(schema, child_id);
1217            let mut cursor = ChildCursor::new(&edges);
1218            return emit_production(protocol, schema, grammar, child_id, rule, &mut cursor, out);
1219        }
1220    }
1221
1222    // Other ALIAS contents (CHOICE, SEQ, literals) walk in place.
1223    let edges = children_for(schema, child_id);
1224    let mut cursor = ChildCursor::new(&edges);
1225    emit_production(
1226        protocol,
1227        schema,
1228        grammar,
1229        child_id,
1230        content,
1231        &mut cursor,
1232        out,
1233    )
1234}
1235
1236fn emit_in_child_context(
1237    protocol: &str,
1238    schema: &Schema,
1239    grammar: &Grammar,
1240    child_id: &panproto_gat::Name,
1241    production: &Production,
1242    out: &mut Output<'_>,
1243) -> Result<(), ParseError> {
1244    // The child walks under its own production tree, with its own
1245    // FIELDs setting their own contexts. Clear the outer FIELD hint
1246    // so it does not leak through and cause sibling SYMBOLs inside
1247    // the child's body to mistakenly pull edges from the child's
1248    // cursor by the parent's field name.
1249    let _guard = clear_field_context();
1250    // If `production` is a structural wrapper (CHOICE / SEQ /
1251    // OPTIONAL / ...) whose referenced symbols cover the child's own
1252    // kind, the child IS the production's target node and the right
1253    // emit path is `emit_vertex(child)` (which honours the
1254    // literal-value leaf shortcut). Without this guard, FIELD(pattern,
1255    // CHOICE { _pattern, self }) on an identifier child walks the
1256    // CHOICE on the identifier's empty cursor, falls through to the
1257    // first non-BLANK alt, and loses the captured identifier text.
1258    if !matches!(production, Production::Symbol { .. }) {
1259        let child_kind = schema.vertices.get(child_id).map(|v| v.kind.as_ref());
1260        let symbols = referenced_symbols(production);
1261        if symbols
1262            .iter()
1263            .any(|s| kind_satisfies_symbol(grammar, child_kind, s) || child_kind == Some(s))
1264        {
1265            return emit_vertex(protocol, schema, grammar, child_id, out);
1266        }
1267    }
1268    match production {
1269        Production::Symbol { .. } => emit_vertex(protocol, schema, grammar, child_id, out),
1270        _ => {
1271            let edges = children_for(schema, child_id);
1272            let mut cursor = ChildCursor::new(&edges);
1273            emit_production(
1274                protocol,
1275                schema,
1276                grammar,
1277                child_id,
1278                production,
1279                &mut cursor,
1280                out,
1281            )
1282        }
1283    }
1284}
1285
1286fn pick_choice_with_cursor<'a>(
1287    schema: &Schema,
1288    grammar: &Grammar,
1289    vertex_id: &panproto_gat::Name,
1290    cursor: &ChildCursor<'_>,
1291    alternatives: &'a [Production],
1292) -> Option<&'a Production> {
1293    // Discriminator-driven dispatch (highest priority): when the
1294    // walker recorded a `chose-alt-fingerprint` constraint at parse
1295    // time, dispatch directly against that. This is the categorical
1296    // discriminator: it survives stripping of byte-position
1297    // constraints (so by-construction round-trips work) and is the
1298    // explicit witness of which CHOICE alternative the parser took.
1299    //
1300    // Falls back to the live `interstitial-*` substring blob when no
1301    // fingerprint is present (e.g. instances built by callers that
1302    // bypass the AstWalker). Both blobs are scored by the longest
1303    // STRING-literal token in an alternative that matches; the
1304    // length tiebreak prefers `&&` over `&`, `==` over `=`, etc.
1305    let constraint_blob = schema
1306        .constraints
1307        .get(vertex_id)
1308        .map(|cs| {
1309            let fingerprint: Option<&str> = cs
1310                .iter()
1311                .find(|c| c.sort.as_ref() == "chose-alt-fingerprint")
1312                .map(|c| c.value.as_str());
1313            if let Some(fp) = fingerprint {
1314                fp.to_owned()
1315            } else {
1316                cs.iter()
1317                    .filter(|c| {
1318                        let s = c.sort.as_ref();
1319                        s.starts_with("interstitial-") && !s.ends_with("-start-byte")
1320                    })
1321                    .map(|c| c.value.as_str())
1322                    .collect::<Vec<&str>>()
1323                    .join(" ")
1324            }
1325        })
1326        .unwrap_or_default();
1327    let child_kinds: Vec<&str> = schema
1328        .constraints
1329        .get(vertex_id)
1330        .and_then(|cs| {
1331            cs.iter()
1332                .find(|c| c.sort.as_ref() == "chose-alt-child-kinds")
1333                .map(|c| c.value.split_whitespace().collect())
1334        })
1335        .unwrap_or_default();
1336    if !constraint_blob.is_empty() {
1337        // Primary score: literal-token match length. This dominates
1338        // alt selection so existing language tests that depend on
1339        // literal-only fingerprints keep working.
1340        // Secondary score (tiebreaker only): named-symbol kind match
1341        // count, read from the separate `chose-alt-child-kinds`
1342        // constraint (kept apart from the literal fingerprint so
1343        // identifiers like `:` in the kind list don't contaminate the
1344        // literal match). An alt that matches the recorded kinds is a
1345        // stronger witness than one whose only
1346        // overlap is literal punctuation.
1347        let mut best_literal: usize = 0;
1348        let mut best_symbols: usize = 0;
1349        let mut best_alt: Option<&Production> = None;
1350        let mut tied = false;
1351        for alt in alternatives {
1352            let strings = literal_strings(alt);
1353            if strings.is_empty() {
1354                continue;
1355            }
1356            let literal_score = strings
1357                .iter()
1358                .filter(|s| constraint_blob.contains(s.as_str()))
1359                .map(String::len)
1360                .sum::<usize>();
1361            if literal_score == 0 {
1362                continue;
1363            }
1364            // Symbol score is computed only as a tiebreaker among alts
1365            // whose literal-token coverage is the same; it never lifts
1366            // an alt above one with a strictly higher literal score.
1367            // Reads the `chose-alt-child-kinds` constraint (a separate
1368            // sequence the walker emits, kept apart from the literal
1369            // fingerprint to avoid cross-contamination).
1370            let symbol_score = if literal_score >= best_literal && !child_kinds.is_empty() {
1371                let symbols = referenced_symbols(alt);
1372                symbols
1373                    .iter()
1374                    .filter(|sym| {
1375                        let sym_str: &str = sym;
1376                        if child_kinds.contains(&sym_str) {
1377                            return true;
1378                        }
1379                        grammar.subtypes.get(sym_str).is_some_and(|sub_set| {
1380                            sub_set
1381                                .iter()
1382                                .any(|sub| child_kinds.contains(&sub.as_str()))
1383                        })
1384                    })
1385                    .count()
1386            } else {
1387                0
1388            };
1389            let better = literal_score > best_literal
1390                || (literal_score == best_literal && symbol_score > best_symbols);
1391            let same = literal_score == best_literal && symbol_score == best_symbols;
1392            if better {
1393                best_literal = literal_score;
1394                best_symbols = symbol_score;
1395                best_alt = Some(alt);
1396                tied = false;
1397            } else if same && best_alt.is_some() {
1398                tied = true;
1399            }
1400        }
1401        // Only commit to an alt when the fingerprint discriminates it
1402        // uniquely. A tie means the alts share the same literal token
1403        // set (e.g. JSON's `string = CHOICE { SEQ { '"', '"' }, SEQ {
1404        // '"', _string_content, '"' } }` — both alts contain just the
1405        // two `"` tokens). In that case fall through to cursor-based
1406        // dispatch, which uses the actual edge structure.
1407        if let Some(alt) = best_alt {
1408            if !tied {
1409                return Some(alt);
1410            }
1411        }
1412    }
1413
1414    // Cursor-driven dispatch: pick the alternative whose body
1415    // references a SYMBOL covering the *first unconsumed* edge in
1416    // cursor order. `referenced_symbols` walks the alternative
1417    // recursively (across nested SEQs, REPEATs, OPTIONALs, FIELDs,
1418    // etc.) so a leading optional like `attribute_item` does not
1419    // block matching when only the trailing required symbol is
1420    // present on the schema.
1421    //
1422    // Ordering by the first unconsumed edge (rather than picking any
1423    // alternative whose SYMBOL set intersects the unconsumed
1424    // multiset) is what preserves schema edge order under
1425    // REPEAT(CHOICE(...)) productions. Without this rule, alt order
1426    // in the grammar's CHOICE determines the emission order, and a
1427    // schema with interleaved kinds like `[symbol, punct, int,
1428    // symbol, punct, int]` re-fuses to `[symbol, symbol, punct,
1429    // punct, int, int]` when emitted then re-parsed. The fix is the
1430    // categorical reading of REPEAT-over-list (list-shaped fold)
1431    // rather than REPEAT-over-multiset (unordered fold).
1432    let first_unconsumed_kind: Option<&str> = cursor
1433        .edges
1434        .iter()
1435        .enumerate()
1436        .find(|(i, _)| !cursor.consumed[*i])
1437        .and_then(|(_, edge)| schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref()));
1438    if let Some(target_kind) = first_unconsumed_kind {
1439        for alt in alternatives {
1440            let symbols = referenced_symbols(alt);
1441            if !symbols.is_empty()
1442                && symbols
1443                    .iter()
1444                    .any(|s| kind_satisfies_symbol(grammar, Some(target_kind), s))
1445            {
1446                return Some(alt);
1447            }
1448        }
1449    }
1450
1451    // FIELD dispatch: pick an alternative whose FIELD name matches an
1452    // unconsumed edge kind.
1453    let edge_kinds: Vec<&str> = cursor
1454        .edges
1455        .iter()
1456        .enumerate()
1457        .filter(|(i, _)| !cursor.consumed[*i])
1458        .map(|(_, e)| e.kind.as_ref())
1459        .collect();
1460    for alt in alternatives {
1461        if has_field_in(alt, &edge_kinds) {
1462            return Some(alt);
1463        }
1464    }
1465
1466    // No cursor-driven match. Fall back to:
1467    //
1468    // - BLANK (the explicit empty alternative) when present, so an
1469    //   OPTIONAL-shaped CHOICE compiles to nothing.
1470    // - The first non-`BLANK` alternative as a last resort, used by
1471    //   STRING-only alternatives (keyword tokens) and other choices
1472    //   that don't reach the cursor.
1473    //
1474    // The previous "match own_kind" branch is intentionally absent:
1475    // when an alt's first SYMBOL equals the current vertex's kind, the
1476    // caller is already emitting that vertex's own rule. Recursing
1477    // into the alt would cause a self-loop in the rule walk.
1478    let _ = (schema, vertex_id);
1479    if alternatives.iter().any(|a| matches!(a, Production::Blank)) {
1480        return alternatives.iter().find(|a| matches!(a, Production::Blank));
1481    }
1482    alternatives
1483        .iter()
1484        .find(|alt| !matches!(alt, Production::Blank))
1485}
1486
1487/// Collect every literal STRING token directly inside `production`
1488/// (without descending into SYMBOLs / hidden rules). Used to score
1489/// CHOICE alternatives against the parent vertex's interstitials so
1490/// the right operator / keyword form is picked when the schema
1491/// preserves interstitial fragments from a prior parse.
1492fn literal_strings(production: &Production) -> Vec<String> {
1493    let mut out = Vec::new();
1494    fn walk(p: &Production, out: &mut Vec<String>) {
1495        match p {
1496            Production::String { value } if !value.is_empty() => {
1497                out.push(value.clone());
1498            }
1499            Production::Choice { members } | Production::Seq { members } => {
1500                for m in members {
1501                    walk(m, out);
1502                }
1503            }
1504            Production::Repeat { content }
1505            | Production::Repeat1 { content }
1506            | Production::Optional { content }
1507            | Production::Field { content, .. }
1508            | Production::Alias { content, .. }
1509            | Production::Token { content }
1510            | Production::ImmediateToken { content }
1511            | Production::Prec { content, .. }
1512            | Production::PrecLeft { content, .. }
1513            | Production::PrecRight { content, .. }
1514            | Production::PrecDynamic { content, .. }
1515            | Production::Reserved { content, .. } => walk(content, out),
1516            _ => {}
1517        }
1518    }
1519    walk(production, &mut out);
1520    out
1521}
1522
1523/// Collect every SYMBOL name reachable from `production` without
1524/// crossing into nested rules. Used by `pick_choice_with_cursor` to
1525/// rank alternatives by "any SYMBOL inside this alt matches something
1526/// on the cursor", instead of just the first SYMBOL: a leading
1527/// optional like `attribute_item` then `parameter` is otherwise
1528/// rejected when only the parameter children are present.
1529fn referenced_symbols(production: &Production) -> Vec<&str> {
1530    let mut out = Vec::new();
1531    fn walk<'a>(p: &'a Production, out: &mut Vec<&'a str>) {
1532        match p {
1533            Production::Symbol { name } => out.push(name.as_str()),
1534            Production::Choice { members } | Production::Seq { members } => {
1535                for m in members {
1536                    walk(m, out);
1537                }
1538            }
1539            Production::Alias {
1540                content,
1541                named,
1542                value,
1543            } => {
1544                // A named ALIAS produces a child vertex whose kind is
1545                // the alias `value` (e.g. `ALIAS { content: STRING "=",
1546                // value: "punctuation", named: true }` introduces a
1547                // `punctuation` child). For cursor-driven dispatch to
1548                // recognise alts that emit such children, yield the
1549                // alias value as a referenced symbol. Anonymous aliases
1550                // do not introduce a named node and only need their
1551                // inner content's symbols.
1552                if *named && !value.is_empty() {
1553                    out.push(value.as_str());
1554                }
1555                walk(content, out);
1556            }
1557            Production::Repeat { content }
1558            | Production::Repeat1 { content }
1559            | Production::Optional { content }
1560            | Production::Field { content, .. }
1561            | Production::Token { content }
1562            | Production::ImmediateToken { content }
1563            | Production::Prec { content, .. }
1564            | Production::PrecLeft { content, .. }
1565            | Production::PrecRight { content, .. }
1566            | Production::PrecDynamic { content, .. }
1567            | Production::Reserved { content, .. } => walk(content, out),
1568            _ => {}
1569        }
1570    }
1571    walk(production, &mut out);
1572    out
1573}
1574
1575#[cfg(test)]
1576fn first_symbol(production: &Production) -> Option<&str> {
1577    match production {
1578        Production::Symbol { name } => Some(name),
1579        Production::Seq { members } => members.iter().find_map(first_symbol),
1580        Production::Choice { members } => members.iter().find_map(first_symbol),
1581        Production::Repeat { content }
1582        | Production::Repeat1 { content }
1583        | Production::Optional { content }
1584        | Production::Field { content, .. }
1585        | Production::Alias { content, .. }
1586        | Production::Token { content }
1587        | Production::ImmediateToken { content }
1588        | Production::Prec { content, .. }
1589        | Production::PrecLeft { content, .. }
1590        | Production::PrecRight { content, .. }
1591        | Production::PrecDynamic { content, .. }
1592        | Production::Reserved { content, .. } => first_symbol(content),
1593        _ => None,
1594    }
1595}
1596
1597fn has_field_in(production: &Production, edge_kinds: &[&str]) -> bool {
1598    match production {
1599        Production::Field { name, .. } => edge_kinds.contains(&name.as_str()),
1600        Production::Seq { members } | Production::Choice { members } => {
1601            members.iter().any(|m| has_field_in(m, edge_kinds))
1602        }
1603        Production::Repeat { content }
1604        | Production::Repeat1 { content }
1605        | Production::Optional { content }
1606        | Production::Alias { content, .. }
1607        | Production::Token { content }
1608        | Production::ImmediateToken { content }
1609        | Production::Prec { content, .. }
1610        | Production::PrecLeft { content, .. }
1611        | Production::PrecRight { content, .. }
1612        | Production::PrecDynamic { content, .. }
1613        | Production::Reserved { content, .. } => has_field_in(content, edge_kinds),
1614        _ => false,
1615    }
1616}
1617
1618fn has_relevant_constraint(
1619    production: &Production,
1620    schema: &Schema,
1621    vertex_id: &panproto_gat::Name,
1622) -> bool {
1623    let constraints = match schema.constraints.get(vertex_id) {
1624        Some(c) => c,
1625        None => return false,
1626    };
1627    fn walk(production: &Production, constraints: &[panproto_schema::Constraint]) -> bool {
1628        match production {
1629            Production::String { value } => constraints
1630                .iter()
1631                .any(|c| c.value == *value || c.sort.as_ref() == value),
1632            Production::Field { name, content } => {
1633                constraints.iter().any(|c| c.sort.as_ref() == name) || walk(content, constraints)
1634            }
1635            Production::Seq { members } | Production::Choice { members } => {
1636                members.iter().any(|m| walk(m, constraints))
1637            }
1638            Production::Repeat { content }
1639            | Production::Repeat1 { content }
1640            | Production::Optional { content }
1641            | Production::Alias { content, .. }
1642            | Production::Token { content }
1643            | Production::ImmediateToken { content }
1644            | Production::Prec { content, .. }
1645            | Production::PrecLeft { content, .. }
1646            | Production::PrecRight { content, .. }
1647            | Production::PrecDynamic { content, .. }
1648            | Production::Reserved { content, .. } => walk(content, constraints),
1649            _ => false,
1650        }
1651    }
1652    walk(production, constraints)
1653}
1654
1655fn children_for<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Vec<&'a Edge> {
1656    // Walk `outgoing` (insertion-ordered by SchemaBuilder via SmallVec
1657    // append) rather than the unordered `edges` HashMap so abstract
1658    // schemas under REPEAT(CHOICE(...)) preserve the order their edges
1659    // were inserted in. The previous implementation walked the HashMap
1660    // and sorted lexicographically by (kind, target id), which fused
1661    // interleaved children of the same kind into runs (e.g. a sequence
1662    // [symbol, punct, int, symbol, punct, int] became [symbol, symbol,
1663    // punct, punct, int, int] after the lex sort).
1664    let Some(edges) = schema.outgoing.get(vertex_id) else {
1665        return Vec::new();
1666    };
1667
1668    // Look up the canonical Edge reference (the key in `schema.edges`)
1669    // for each entry in `outgoing`. Falls back to the SmallVec entry if
1670    // the canonical key is missing, which would indicate index drift.
1671    let mut indexed: Vec<(usize, u32, &Edge)> = edges
1672        .iter()
1673        .enumerate()
1674        .map(|(i, e)| {
1675            let canonical = schema.edges.get_key_value(e).map_or(e, |(k, _)| k);
1676            let pos = schema.orderings.get(canonical).copied().unwrap_or(u32::MAX);
1677            (i, pos, canonical)
1678        })
1679        .collect();
1680
1681    // Stable sort by (explicit-ordering, insertion-index). Edges with
1682    // an explicit `orderings` entry come first in their declared order;
1683    // the remainder fall through in insertion order.
1684    indexed.sort_by_key(|(i, pos, _)| (*pos, *i));
1685    indexed.into_iter().map(|(_, _, e)| e).collect()
1686}
1687
1688fn vertex_id_kind<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1689    schema.vertices.get(vertex_id).map(|v| v.kind.as_ref())
1690}
1691
1692fn literal_value<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1693    schema
1694        .constraints
1695        .get(vertex_id)?
1696        .iter()
1697        .find(|c| c.sort.as_ref() == "literal-value")
1698        .map(|c| c.value.as_str())
1699}
1700
1701/// True iff `pattern` matches a (possibly optional / repeated) sequence
1702/// of carriage-return and newline characters only. Examples: `\r?\n`,
1703/// `\n`, `\r\n`, `\n+`, `\r?\n+`. Distinguishes structural newline
1704/// terminals from generic whitespace and from other patterns that
1705/// happen to contain a newline escape inside a larger class.
1706fn is_newline_like_pattern(pattern: &str) -> bool {
1707    if pattern.is_empty() {
1708        return false;
1709    }
1710    let mut chars = pattern.chars();
1711    let mut saw_newline_atom = false;
1712    while let Some(c) = chars.next() {
1713        match c {
1714            '\\' => match chars.next() {
1715                Some('n' | 'r') => saw_newline_atom = true,
1716                _ => return false,
1717            },
1718            '?' | '*' | '+' => {} // quantifiers on the previous atom
1719            _ => return false,
1720        }
1721    }
1722    saw_newline_atom
1723}
1724
1725/// True iff `pattern` matches a (possibly quantified) run of generic
1726/// whitespace characters: `\s+`, `[ \t]+`, ` +`, `\s*`. Such patterns
1727/// describe interstitial spacing rather than syntactic content, so the
1728/// pretty emitter can drop them and let the layout pass insert the
1729/// configured separator.
1730fn is_whitespace_only_pattern(pattern: &str) -> bool {
1731    if pattern.is_empty() {
1732        return false;
1733    }
1734    // Strip an outer quantifier suffix.
1735    let trimmed = pattern.trim_end_matches(['?', '*', '+']);
1736    if trimmed.is_empty() {
1737        return false;
1738    }
1739    // Bare `\s` / ` ` / `\t`.
1740    if matches!(trimmed, "\\s" | " " | "\\t") {
1741        return true;
1742    }
1743    // Character class containing only whitespace atoms.
1744    if let Some(inner) = trimmed.strip_prefix('[').and_then(|s| s.strip_suffix(']')) {
1745        let mut chars = inner.chars();
1746        let mut saw_atom = false;
1747        while let Some(c) = chars.next() {
1748            match c {
1749                '\\' => match chars.next() {
1750                    Some('s' | 't' | 'r' | 'n') => saw_atom = true,
1751                    _ => return false,
1752                },
1753                ' ' | '\t' => saw_atom = true,
1754                _ => return false,
1755            }
1756        }
1757        return saw_atom;
1758    }
1759    false
1760}
1761
1762fn placeholder_for_pattern(pattern: &str) -> String {
1763    // Heuristic placeholder for unconstrained PATTERN terminals.
1764    //
1765    // First handle the "the regex IS a literal escape" cases that
1766    // tree-sitter grammars use as separators (`\n`, `\r\n`, `;`,
1767    // etc.); emitting the matching character is always preferable
1768    // to a `_x` identifier-like placeholder when the surrounding
1769    // grammar expects a separator.
1770    let simple_lit = decode_simple_pattern_literal(pattern);
1771    if let Some(lit) = simple_lit {
1772        return lit;
1773    }
1774
1775    if pattern.contains("[0-9]") || pattern.contains("\\d") {
1776        "0".into()
1777    } else if pattern.contains("[a-zA-Z_]") || pattern.contains("\\w") {
1778        "_x".into()
1779    } else if pattern.contains('"') || pattern.contains('\'') {
1780        "\"\"".into()
1781    } else {
1782        "_".into()
1783    }
1784}
1785
1786/// Decode a tree-sitter PATTERN whose regex is a simple literal
1787/// (newline, semicolon, comma, etc.) to the byte sequence it matches.
1788/// Returns `None` for patterns with character classes, alternations,
1789/// or quantifiers; the caller falls back to the heuristic placeholder.
1790fn decode_simple_pattern_literal(pattern: &str) -> Option<String> {
1791    // Skip patterns containing regex metachars that would broaden the
1792    // match beyond a single literal byte sequence.
1793    if pattern
1794        .chars()
1795        .any(|c| matches!(c, '[' | ']' | '(' | ')' | '*' | '+' | '?' | '|' | '{' | '}'))
1796    {
1797        return None;
1798    }
1799    let mut out = String::new();
1800    let mut chars = pattern.chars();
1801    while let Some(c) = chars.next() {
1802        if c == '\\' {
1803            match chars.next() {
1804                Some('n') => out.push('\n'),
1805                Some('r') => out.push('\r'),
1806                Some('t') => out.push('\t'),
1807                Some('\\') => out.push('\\'),
1808                Some('/') => out.push('/'),
1809                Some(other) => out.push(other),
1810                None => return None,
1811            }
1812        } else {
1813            out.push(c);
1814        }
1815    }
1816    Some(out)
1817}
1818
1819// ═══════════════════════════════════════════════════════════════════
1820// Token list output with Spacing algebra
1821// ═══════════════════════════════════════════════════════════════════
1822//
1823// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
1824// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
1825// by `FormatPolicy`. Separating the structural output from the layout
1826// decision means each phase has one job: emit walks the grammar and
1827// pushes tokens; layout is a single fold, locally driven by adjacent
1828// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
1829
1830#[derive(Clone)]
1831enum Token {
1832    /// A user-visible terminal contributed by the grammar.
1833    Lit(String),
1834    /// `indent_open` marker emitted when a `Lit` matched the policy's
1835    /// open list. Carried as a separate token so layout can decide to
1836    /// break + indent without re-scanning.
1837    IndentOpen,
1838    /// `indent_close` marker emitted before a closer-`Lit`.
1839    IndentClose,
1840    /// "Break a line here if not already at line start" — used after
1841    /// statements/declarations and after open braces.
1842    LineBreak,
1843}
1844
1845struct Output<'a> {
1846    tokens: Vec<Token>,
1847    policy: &'a FormatPolicy,
1848}
1849
1850#[derive(Clone)]
1851struct OutputSnapshot {
1852    tokens_len: usize,
1853}
1854
1855impl<'a> Output<'a> {
1856    fn new(policy: &'a FormatPolicy) -> Self {
1857        Self {
1858            tokens: Vec::new(),
1859            policy,
1860        }
1861    }
1862
1863    fn token(&mut self, value: &str) {
1864        if value.is_empty() {
1865            return;
1866        }
1867
1868        if self.policy.indent_close.iter().any(|t| t == value) {
1869            self.tokens.push(Token::IndentClose);
1870        }
1871
1872        self.tokens.push(Token::Lit(value.to_owned()));
1873
1874        if self.policy.indent_open.iter().any(|t| t == value) {
1875            self.tokens.push(Token::IndentOpen);
1876            self.tokens.push(Token::LineBreak);
1877        } else if self.policy.line_break_after.iter().any(|t| t == value) {
1878            self.tokens.push(Token::LineBreak);
1879        }
1880    }
1881
1882    fn newline(&mut self) {
1883        self.tokens.push(Token::LineBreak);
1884    }
1885
1886    /// Open an indent scope: subsequent `LineBreak`s render at the
1887    /// new depth until a matching `indent_close` pops it. Used by the
1888    /// external-token fallback to render indent-based grammars'
1889    /// `_indent` scanner outputs.
1890    fn indent_open(&mut self) {
1891        self.tokens.push(Token::IndentOpen);
1892        self.tokens.push(Token::LineBreak);
1893    }
1894
1895    /// Close one indent scope opened by `indent_open`.
1896    fn indent_close(&mut self) {
1897        self.tokens.push(Token::IndentClose);
1898    }
1899
1900    fn snapshot(&self) -> OutputSnapshot {
1901        OutputSnapshot {
1902            tokens_len: self.tokens.len(),
1903        }
1904    }
1905
1906    fn restore(&mut self, snap: OutputSnapshot) {
1907        self.tokens.truncate(snap.tokens_len);
1908    }
1909
1910    fn finish(self) -> Vec<u8> {
1911        layout(&self.tokens, self.policy)
1912    }
1913}
1914
1915/// Fold a token list into bytes. The algebra:
1916/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
1917/// * `IndentOpen` / `IndentClose` adjust a depth counter,
1918/// * `LineBreak` writes `\n` if not already at line start, then the
1919///   next `Lit` writes `indent * indent_width` spaces of indent.
1920fn layout(tokens: &[Token], policy: &FormatPolicy) -> Vec<u8> {
1921    let mut bytes = Vec::new();
1922    let mut indent: usize = 0;
1923    let mut at_line_start = true;
1924    let mut last_lit: Option<&str> = None;
1925    // True iff, at the moment `last_lit` was emitted, the cursor was at a
1926    // position where the grammar expects an operand: start of stream / line,
1927    // just after an open paren / bracket / brace, just after a separator like
1928    // `,` or `;`, or just after a binary / assignment operator. Used by
1929    // `needs_space_between` to recognise `last_lit` as a tight unary prefix
1930    // (`f(-1.0)`) rather than a spaced binary operator (`a - b`).
1931    let mut last_was_in_operand_position = true;
1932    let mut expecting_operand = true;
1933    let newline = policy.newline.as_bytes();
1934    let separator = policy.separator.as_bytes();
1935
1936    for tok in tokens {
1937        match tok {
1938            Token::IndentOpen => indent += 1,
1939            Token::IndentClose => {
1940                indent = indent.saturating_sub(1);
1941                if !at_line_start {
1942                    bytes.extend_from_slice(newline);
1943                    at_line_start = true;
1944                    expecting_operand = true;
1945                }
1946            }
1947            Token::LineBreak => {
1948                if !at_line_start {
1949                    bytes.extend_from_slice(newline);
1950                    at_line_start = true;
1951                    expecting_operand = true;
1952                }
1953            }
1954            Token::Lit(value) => {
1955                if at_line_start {
1956                    bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
1957                } else if let Some(prev) = last_lit {
1958                    if needs_space_between(prev, value, last_was_in_operand_position) {
1959                        bytes.extend_from_slice(separator);
1960                    }
1961                }
1962                bytes.extend_from_slice(value.as_bytes());
1963                at_line_start = false;
1964                last_was_in_operand_position = expecting_operand;
1965                expecting_operand = leaves_operand_position(value);
1966                last_lit = Some(value.as_str());
1967            }
1968        }
1969    }
1970
1971    if !at_line_start {
1972        bytes.extend_from_slice(newline);
1973    }
1974    bytes
1975}
1976
1977/// True iff emitting `tok` leaves the cursor in a position where the
1978/// grammar expects an operand next. Operand-introducing tokens are open
1979/// punctuation, separators, and operator-like strings; operand-terminating
1980/// tokens are identifiers, literals, and closing punctuation.
1981fn leaves_operand_position(tok: &str) -> bool {
1982    if tok.is_empty() {
1983        return true;
1984    }
1985    if is_punct_open(tok) {
1986        return true;
1987    }
1988    if matches!(tok, "," | ";") {
1989        return true;
1990    }
1991    if is_punct_close(tok) {
1992        return false;
1993    }
1994    if first_is_alnum_or_underscore(tok) || last_ends_with_alnum(tok) {
1995        return false;
1996    }
1997    // Pure punctuation/operator runs (`=`, `+`, `-`, `<=`, `>>`, …) leave
1998    // the cursor expecting another operand.
1999    true
2000}
2001
2002fn needs_space_between(last: &str, next: &str, expecting_operand: bool) -> bool {
2003    if last.is_empty() || next.is_empty() {
2004        return false;
2005    }
2006    if is_punct_open(last) || is_punct_open(next) {
2007        return false;
2008    }
2009    if is_punct_close(next) {
2010        return false;
2011    }
2012    if is_punct_close(last) && is_punct_punctuation(next) {
2013        return false;
2014    }
2015    if last == "." || next == "." {
2016        return false;
2017    }
2018    // Tight unary prefix: `last` is a sign/logical-not operator emitted
2019    // where the grammar expected an operand, so it glues to `next`.
2020    // `expecting_operand` here means: just before `last` was emitted,
2021    // the cursor expected an operand, which makes `last` a unary prefix.
2022    // Examples: `f(-1.0)`, `[ -2, 3 ]`, `return -x`, `a = !flag`.
2023    if expecting_operand && is_unary_prefix_operator(last) && first_is_operand_start(next) {
2024        return false;
2025    }
2026    if last_is_word_like(last) && first_is_word_like(next) {
2027        return true;
2028    }
2029    if last_ends_with_alnum(last) && first_is_alnum_or_underscore(next) {
2030        return true;
2031    }
2032    // Adjacent operator runs: keep them apart so the lexer doesn't glue
2033    // `>` and `=` into `>=` unintentionally.
2034    true
2035}
2036
2037fn is_unary_prefix_operator(s: &str) -> bool {
2038    matches!(s, "-" | "+" | "!" | "~")
2039}
2040
2041fn first_is_operand_start(s: &str) -> bool {
2042    s.chars()
2043        .next()
2044        .map(|c| c.is_alphanumeric() || c == '_' || c == '.' || c == '(')
2045        .unwrap_or(false)
2046}
2047
2048fn is_punct_open(s: &str) -> bool {
2049    matches!(s, "(" | "[" | "{" | "\"" | "'" | "`")
2050}
2051
2052fn is_punct_close(s: &str) -> bool {
2053    matches!(s, ")" | "]" | "}" | "," | ";" | ":" | "\"" | "'" | "`")
2054}
2055
2056fn is_punct_punctuation(s: &str) -> bool {
2057    matches!(s, "," | ";" | ":" | "." | ")" | "]" | "}")
2058}
2059
2060fn last_is_word_like(s: &str) -> bool {
2061    s.chars()
2062        .next_back()
2063        .map(|c| c.is_alphanumeric() || c == '_')
2064        .unwrap_or(false)
2065}
2066
2067fn first_is_word_like(s: &str) -> bool {
2068    s.chars()
2069        .next()
2070        .map(|c| c.is_alphanumeric() || c == '_')
2071        .unwrap_or(false)
2072}
2073
2074fn last_ends_with_alnum(s: &str) -> bool {
2075    s.chars()
2076        .next_back()
2077        .map(char::is_alphanumeric)
2078        .unwrap_or(false)
2079}
2080
2081fn first_is_alnum_or_underscore(s: &str) -> bool {
2082    s.chars()
2083        .next()
2084        .map(|c| c.is_alphanumeric() || c == '_')
2085        .unwrap_or(false)
2086}
2087
2088#[cfg(test)]
2089mod tests {
2090    use super::*;
2091
2092    #[test]
2093    fn parses_simple_grammar_json() {
2094        let bytes = br#"{
2095            "name": "tiny",
2096            "rules": {
2097                "program": {
2098                    "type": "SEQ",
2099                    "members": [
2100                        {"type": "STRING", "value": "hello"},
2101                        {"type": "STRING", "value": ";"}
2102                    ]
2103                }
2104            }
2105        }"#;
2106        let g = Grammar::from_bytes("tiny", bytes).expect("valid tiny grammar");
2107        assert!(g.rules.contains_key("program"));
2108    }
2109
2110    #[test]
2111    fn output_emits_punctuation_without_leading_space() {
2112        let policy = FormatPolicy::default();
2113        let mut out = Output::new(&policy);
2114        out.token("foo");
2115        out.token("(");
2116        out.token(")");
2117        out.token(";");
2118        let bytes = out.finish();
2119        let s = std::str::from_utf8(&bytes).expect("ascii output");
2120        assert!(s.starts_with("foo();"), "got {s:?}");
2121    }
2122
2123    #[test]
2124    fn grammar_from_bytes_rejects_malformed_input() {
2125        let result = Grammar::from_bytes("malformed", b"not json");
2126        let err = result.expect_err("malformed bytes must yield Err");
2127        let msg = err.to_string();
2128        assert!(
2129            msg.contains("malformed"),
2130            "error message should name the protocol: {msg:?}"
2131        );
2132    }
2133
2134    #[test]
2135    fn output_indents_after_open_brace() {
2136        let policy = FormatPolicy::default();
2137        let mut out = Output::new(&policy);
2138        out.token("fn");
2139        out.token("foo");
2140        out.token("(");
2141        out.token(")");
2142        out.token("{");
2143        out.token("body");
2144        out.token("}");
2145        let bytes = out.finish();
2146        let s = std::str::from_utf8(&bytes).expect("ascii output");
2147        assert!(s.contains("{\n"), "newline after opening brace: {s:?}");
2148        assert!(s.contains("body"), "body inside block: {s:?}");
2149        assert!(s.ends_with("}\n"), "newline after closing brace: {s:?}");
2150    }
2151
2152    #[test]
2153    fn output_no_space_between_word_and_dot() {
2154        let policy = FormatPolicy::default();
2155        let mut out = Output::new(&policy);
2156        out.token("foo");
2157        out.token(".");
2158        out.token("bar");
2159        let bytes = out.finish();
2160        let s = std::str::from_utf8(&bytes).expect("ascii output");
2161        assert!(s.starts_with("foo.bar"), "no space around dot: {s:?}");
2162    }
2163
2164    #[test]
2165    fn output_snapshot_restore_truncates_bytes() {
2166        let policy = FormatPolicy::default();
2167        let mut out = Output::new(&policy);
2168        out.token("keep");
2169        let snap = out.snapshot();
2170        out.token("drop");
2171        out.token("more");
2172        out.restore(snap);
2173        out.token("after");
2174        let bytes = out.finish();
2175        let s = std::str::from_utf8(&bytes).expect("ascii output");
2176        assert!(s.contains("keep"), "kept token survives: {s:?}");
2177        assert!(s.contains("after"), "post-restore token visible: {s:?}");
2178        assert!(!s.contains("drop"), "rolled-back token removed: {s:?}");
2179        assert!(!s.contains("more"), "rolled-back token removed: {s:?}");
2180    }
2181
2182    #[test]
2183    fn child_cursor_take_field_consumes_once() {
2184        let edges_owned: Vec<Edge> = vec![Edge {
2185            src: panproto_gat::Name::from("p"),
2186            tgt: panproto_gat::Name::from("c"),
2187            kind: panproto_gat::Name::from("name"),
2188            name: None,
2189        }];
2190        let edges: Vec<&Edge> = edges_owned.iter().collect();
2191        let mut cursor = ChildCursor::new(&edges);
2192        let first = cursor.take_field("name");
2193        let second = cursor.take_field("name");
2194        assert!(first.is_some(), "first take returns the edge");
2195        assert!(
2196            second.is_none(),
2197            "second take returns None (already consumed)"
2198        );
2199    }
2200
2201    #[test]
2202    fn child_cursor_take_matching_predicate() {
2203        let edges_owned: Vec<Edge> = vec![
2204            Edge {
2205                src: "p".into(),
2206                tgt: "c1".into(),
2207                kind: "child_of".into(),
2208                name: None,
2209            },
2210            Edge {
2211                src: "p".into(),
2212                tgt: "c2".into(),
2213                kind: "key".into(),
2214                name: None,
2215            },
2216        ];
2217        let edges: Vec<&Edge> = edges_owned.iter().collect();
2218        let mut cursor = ChildCursor::new(&edges);
2219        assert!(cursor.has_matching(|e| e.kind.as_ref() == "key"));
2220        let taken = cursor.take_matching(|e| e.kind.as_ref() == "key");
2221        assert!(taken.is_some());
2222        assert!(
2223            !cursor.has_matching(|e| e.kind.as_ref() == "key"),
2224            "consumed edge no longer matches"
2225        );
2226        assert!(
2227            cursor.has_matching(|e| e.kind.as_ref() == "child_of"),
2228            "the other edge is still available"
2229        );
2230    }
2231
2232    #[test]
2233    fn kind_satisfies_symbol_direct_match() {
2234        let bytes = br#"{
2235            "name": "tiny",
2236            "rules": {
2237                "x": {"type": "STRING", "value": "x"}
2238            }
2239        }"#;
2240        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
2241        assert!(kind_satisfies_symbol(&g, Some("x"), "x"));
2242        assert!(!kind_satisfies_symbol(&g, Some("y"), "x"));
2243        assert!(!kind_satisfies_symbol(&g, None, "x"));
2244    }
2245
2246    #[test]
2247    fn kind_satisfies_symbol_through_hidden_rule() {
2248        let bytes = br#"{
2249            "name": "tiny",
2250            "rules": {
2251                "_value": {
2252                    "type": "CHOICE",
2253                    "members": [
2254                        {"type": "SYMBOL", "name": "object"},
2255                        {"type": "SYMBOL", "name": "number"}
2256                    ]
2257                },
2258                "object": {"type": "STRING", "value": "{}"},
2259                "number": {"type": "PATTERN", "value": "[0-9]+"}
2260            }
2261        }"#;
2262        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
2263        assert!(
2264            kind_satisfies_symbol(&g, Some("number"), "_value"),
2265            "number is reachable from _value via CHOICE"
2266        );
2267        assert!(
2268            kind_satisfies_symbol(&g, Some("object"), "_value"),
2269            "object is reachable from _value via CHOICE"
2270        );
2271        assert!(
2272            !kind_satisfies_symbol(&g, Some("string"), "_value"),
2273            "string is NOT among the alternatives"
2274        );
2275    }
2276
2277    #[test]
2278    fn first_symbol_skips_string_terminals() {
2279        let prod: Production = serde_json::from_str(
2280            r#"{
2281                "type": "SEQ",
2282                "members": [
2283                    {"type": "STRING", "value": "{"},
2284                    {"type": "SYMBOL", "name": "body"},
2285                    {"type": "STRING", "value": "}"}
2286                ]
2287            }"#,
2288        )
2289        .expect("valid SEQ");
2290        assert_eq!(first_symbol(&prod), Some("body"));
2291    }
2292
2293    #[test]
2294    fn placeholder_for_pattern_routes_by_regex_class() {
2295        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
2296        assert_eq!(placeholder_for_pattern("[a-zA-Z_]\\w*"), "_x");
2297        assert_eq!(placeholder_for_pattern("\"[^\"]*\""), "\"\"");
2298        assert_eq!(placeholder_for_pattern("\\d+\\.\\d+"), "0");
2299    }
2300
2301    #[test]
2302    fn format_policy_default_breaks_after_semicolon() {
2303        let policy = FormatPolicy::default();
2304        assert!(policy.line_break_after.iter().any(|t| t == ";"));
2305        assert!(policy.indent_open.iter().any(|t| t == "{"));
2306        assert!(policy.indent_close.iter().any(|t| t == "}"));
2307        assert_eq!(policy.indent_width, 2);
2308    }
2309
2310    #[test]
2311    fn placeholder_decodes_literal_pattern_separators() {
2312        // PATTERN regexes that match a single literal byte sequence
2313        // (newline, semicolon, comma) emit the bytes verbatim instead
2314        // of falling through to the `_` catch-all.
2315        assert_eq!(placeholder_for_pattern("\\n"), "\n");
2316        assert_eq!(placeholder_for_pattern("\\r\\n"), "\r\n");
2317        assert_eq!(placeholder_for_pattern(";"), ";");
2318        // Patterns with character classes / alternation still route
2319        // through the heuristic.
2320        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
2321        assert_eq!(placeholder_for_pattern("a|b"), "_");
2322    }
2323
2324    #[test]
2325    fn supertypes_decode_from_grammar_json_strings() {
2326        // Tree-sitter older grammars list supertypes as bare strings.
2327        let bytes = br#"{
2328            "name": "tiny",
2329            "supertypes": ["expression"],
2330            "rules": {
2331                "expression": {
2332                    "type": "CHOICE",
2333                    "members": [
2334                        {"type": "SYMBOL", "name": "binary_expression"},
2335                        {"type": "SYMBOL", "name": "identifier"}
2336                    ]
2337                },
2338                "binary_expression": {"type": "STRING", "value": "x"},
2339                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2340            }
2341        }"#;
2342        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2343        assert!(g.supertypes.contains("expression"));
2344        // identifier matches the supertype `expression`.
2345        assert!(kind_satisfies_symbol(&g, Some("identifier"), "expression"));
2346        // unrelated kinds do not.
2347        assert!(!kind_satisfies_symbol(&g, Some("string"), "expression"));
2348    }
2349
2350    #[test]
2351    fn supertypes_decode_from_grammar_json_objects() {
2352        // Recent grammars list supertypes as `{type: SYMBOL, name: ...}`
2353        // entries instead of bare strings.
2354        let bytes = br#"{
2355            "name": "tiny",
2356            "supertypes": [{"type": "SYMBOL", "name": "stmt"}],
2357            "rules": {
2358                "stmt": {
2359                    "type": "CHOICE",
2360                    "members": [
2361                        {"type": "SYMBOL", "name": "while_stmt"},
2362                        {"type": "SYMBOL", "name": "if_stmt"}
2363                    ]
2364                },
2365                "while_stmt": {"type": "STRING", "value": "while"},
2366                "if_stmt": {"type": "STRING", "value": "if"}
2367            }
2368        }"#;
2369        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2370        assert!(g.supertypes.contains("stmt"));
2371        assert!(kind_satisfies_symbol(&g, Some("while_stmt"), "stmt"));
2372    }
2373
2374    #[test]
2375    fn alias_value_matches_kind() {
2376        // A named ALIAS rewrites the parser-visible kind to `value`;
2377        // `kind_satisfies_symbol` should accept that rewritten kind
2378        // when looking up the original SYMBOL.
2379        let bytes = br#"{
2380            "name": "tiny",
2381            "rules": {
2382                "_package_identifier": {
2383                    "type": "ALIAS",
2384                    "named": true,
2385                    "value": "package_identifier",
2386                    "content": {"type": "SYMBOL", "name": "identifier"}
2387                },
2388                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2389            }
2390        }"#;
2391        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2392        assert!(kind_satisfies_symbol(
2393            &g,
2394            Some("package_identifier"),
2395            "_package_identifier"
2396        ));
2397    }
2398
2399    #[test]
2400    fn referenced_symbols_walks_nested_seq() {
2401        let prod: Production = serde_json::from_str(
2402            r#"{
2403                "type": "SEQ",
2404                "members": [
2405                    {"type": "CHOICE", "members": [
2406                        {"type": "SYMBOL", "name": "attribute_item"},
2407                        {"type": "BLANK"}
2408                    ]},
2409                    {"type": "SYMBOL", "name": "parameter"},
2410                    {"type": "REPEAT", "content": {
2411                        "type": "SEQ",
2412                        "members": [
2413                            {"type": "STRING", "value": ","},
2414                            {"type": "SYMBOL", "name": "parameter"}
2415                        ]
2416                    }}
2417                ]
2418            }"#,
2419        )
2420        .expect("seq");
2421        let symbols = referenced_symbols(&prod);
2422        assert!(symbols.contains(&"attribute_item"));
2423        assert!(symbols.contains(&"parameter"));
2424    }
2425
2426    #[test]
2427    fn literal_strings_collects_choice_members() {
2428        let prod: Production = serde_json::from_str(
2429            r#"{
2430                "type": "CHOICE",
2431                "members": [
2432                    {"type": "STRING", "value": "+"},
2433                    {"type": "STRING", "value": "-"},
2434                    {"type": "STRING", "value": "*"}
2435                ]
2436            }"#,
2437        )
2438        .expect("choice");
2439        let strings = literal_strings(&prod);
2440        assert_eq!(strings, vec!["+", "-", "*"]);
2441    }
2442
2443    /// The ocaml and javascript grammars (tree-sitter ≥ 0.25) emit a
2444    /// `RESERVED` rule kind that an earlier deserialiser rejected
2445    /// with `unknown variant "RESERVED"`. Verify both that the bare
2446    /// variant deserialises and that a `RESERVED`-wrapped grammar is
2447    /// loadable end-to-end via [`Grammar::from_bytes`].
2448    #[test]
2449    fn reserved_variant_deserialises() {
2450        let prod: Production = serde_json::from_str(
2451            r#"{
2452                "type": "RESERVED",
2453                "content": {"type": "SYMBOL", "name": "_lowercase_identifier"},
2454                "context_name": "attribute_id"
2455            }"#,
2456        )
2457        .expect("RESERVED parses");
2458        match prod {
2459            Production::Reserved { content, .. } => match *content {
2460                Production::Symbol { name } => assert_eq!(name, "_lowercase_identifier"),
2461                other => panic!("expected inner SYMBOL, got {other:?}"),
2462            },
2463            other => panic!("expected RESERVED, got {other:?}"),
2464        }
2465    }
2466
2467    #[test]
2468    fn reserved_grammar_loads_end_to_end() {
2469        let bytes = br#"{
2470            "name": "tiny_reserved",
2471            "rules": {
2472                "program": {
2473                    "type": "RESERVED",
2474                    "content": {"type": "SYMBOL", "name": "ident"},
2475                    "context_name": "keywords"
2476                },
2477                "ident": {"type": "PATTERN", "value": "[a-z]+"}
2478            }
2479        }"#;
2480        let g = Grammar::from_bytes("tiny_reserved", bytes).expect("RESERVED-using grammar loads");
2481        assert!(g.rules.contains_key("program"));
2482    }
2483
2484    #[test]
2485    fn reserved_walker_helpers_recurse_into_content() {
2486        // The walker's helpers (first_symbol, has_field_in,
2487        // referenced_symbols, ...) all need to descend through
2488        // RESERVED into its content. If they bail at RESERVED, the
2489        // `pick_choice_with_cursor` heuristic ranks the alt below
2490        // alts that DO recurse, which produces wrong emit output
2491        // even when the deserialiser doesn't crash.
2492        let prod: Production = serde_json::from_str(
2493            r#"{
2494                "type": "RESERVED",
2495                "content": {
2496                    "type": "FIELD",
2497                    "name": "lhs",
2498                    "content": {"type": "SYMBOL", "name": "expr"}
2499                },
2500                "context_name": "ctx"
2501            }"#,
2502        )
2503        .expect("nested RESERVED parses");
2504        assert_eq!(first_symbol(&prod), Some("expr"));
2505        assert!(has_field_in(&prod, &["lhs"]));
2506        let symbols = referenced_symbols(&prod);
2507        assert!(symbols.contains(&"expr"));
2508    }
2509}
panproto_parse/emit_pretty.rs

panproto_parse/
emit_pretty.rs