Skip to main content

panproto_parse/
emit_pretty.rs

1#![allow(
2    clippy::module_name_repetitions,
3    clippy::too_many_lines,
4    clippy::too_many_arguments,
5    clippy::map_unwrap_or,
6    clippy::option_if_let_else,
7    clippy::elidable_lifetime_names,
8    clippy::items_after_statements,
9    clippy::needless_pass_by_value,
10    clippy::single_match_else,
11    clippy::manual_let_else,
12    clippy::match_same_arms,
13    clippy::missing_const_for_fn,
14    clippy::single_char_pattern,
15    clippy::naive_bytecount,
16    clippy::expect_used,
17    clippy::redundant_pub_crate,
18    clippy::used_underscore_binding,
19    clippy::redundant_field_names,
20    clippy::struct_field_names,
21    clippy::redundant_else,
22    clippy::similar_names
23)]
24
25//! De-novo source emission from a by-construction schema.
26//!
27//! [`AstParser::emit`] reconstructs source from byte-position fragments
28//! that the parser stored on the schema during `parse`. That works for
29//! edit pipelines (`parse → transform → emit`) but fails for schemas
30//! built by hand (`SchemaBuilder` with no parse history): they carry
31//! no `start-byte`, no `interstitial-N`, no `literal-value`, and the
32//! reconstructor returns `Err(EmitFailed { reason: "schema has no
33//! text fragments" })`.
34//!
35//! This module renders such schemas to source bytes by walking
36//! tree-sitter's `grammar.json` production rules. For each schema
37//! vertex of kind `K`, the walker looks up `K`'s production in the
38//! grammar and emits its body in order:
39//!
40//! - `STRING` nodes contribute literal token bytes directly.
41//! - `SYMBOL` and `FIELD` nodes recurse into the schema's children,
42//!   matching by edge kind (which is the tree-sitter field name).
43//! - `SEQ` emits its members in order.
44//! - `CHOICE` picks the alternative whose head `SYMBOL` matches an
45//!   actual child kind, or whose terminals appear in the rendered
46//!   prefix; falls back to the first non-`BLANK` alternative when no
47//!   alternative matches.
48//! - `REPEAT` and `REPEAT1` emit their content once per matching
49//!   child edge in declared order.
50//! - `OPTIONAL` emits its content iff a corresponding child edge or
51//!   constraint is populated.
52//! - `PATTERN` is a regex placeholder for variable-text terminals
53//!   (identifiers, numbers, quoted strings). The walker emits a
54//!   `literal-value` constraint when present and otherwise falls
55//!   back to a placeholder derived from the regex shape.
56//! - `BLANK`, `TOKEN`, `IMMEDIATE_TOKEN`, `ALIAS`, `PREC*` are
57//!   handled transparently (the inner content is emitted; the
58//!   wrapper is dropped).
59//!
60//! Whitespace and indentation come from a `FormatPolicy` applied
61//! during emission. The default policy inserts a single space between
62//! adjacent tokens, a newline after `;` / `}` / `{`, and tracks an
63//! indent counter on `{` / `}` boundaries.
64//!
65//! Output is *syntactically valid* for any grammar that ships
66//! `grammar.json`. Idiomatic formatting (rustfmt-style spacing rules,
67//! per-language conventions) is a polish layer that lives outside
68//! this module.
69
70use std::collections::BTreeMap;
71
72use panproto_schema::{Edge, Schema};
73use serde::Deserialize;
74
75use crate::error::ParseError;
76
77// ═══════════════════════════════════════════════════════════════════
78// Grammar JSON model
79// ═══════════════════════════════════════════════════════════════════
80
81/// A single tree-sitter production rule.
82///
83/// Mirrors the shape emitted by `tree-sitter generate`: every node has
84/// a `type` discriminator that selects a structural variant. The
85/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
86/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
87/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
88/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
89#[derive(Debug, Clone, Deserialize)]
90#[serde(tag = "type")]
91pub enum Production {
92    /// Concatenation of productions.
93    #[serde(rename = "SEQ")]
94    Seq {
95        /// Ordered members; each is emitted in turn.
96        members: Vec<Self>,
97    },
98    /// Alternation between productions.
99    #[serde(rename = "CHOICE")]
100    Choice {
101        /// Alternatives; the walker picks one based on the schema's
102        /// children and constraints.
103        members: Vec<Self>,
104    },
105    /// Zero-or-more repetition.
106    #[serde(rename = "REPEAT")]
107    Repeat {
108        /// The repeated body.
109        content: Box<Self>,
110    },
111    /// One-or-more repetition.
112    #[serde(rename = "REPEAT1")]
113    Repeat1 {
114        /// The repeated body.
115        content: Box<Self>,
116    },
117    /// Optional inclusion (zero or one).
118    ///
119    /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
120    /// BLANK }`, but recent generator versions also emit explicit
121    /// `OPTIONAL` nodes; both shapes are accepted.
122    #[serde(rename = "OPTIONAL")]
123    Optional {
124        /// The optional body.
125        content: Box<Self>,
126    },
127    /// Reference to another rule by name.
128    #[serde(rename = "SYMBOL")]
129    Symbol {
130        /// Name of the referenced rule (matches a vertex kind on the
131        /// schema side).
132        name: String,
133    },
134    /// Literal token bytes.
135    #[serde(rename = "STRING")]
136    String {
137        /// The literal token. Emitted verbatim.
138        value: String,
139    },
140    /// Regex-matched terminal.
141    ///
142    /// At parse time this matches arbitrary bytes; at emit time the
143    /// walker substitutes a `literal-value` constraint when present
144    /// and falls back to a placeholder otherwise.
145    #[serde(rename = "PATTERN")]
146    Pattern {
147        /// The original regex.
148        value: String,
149    },
150    /// The empty production. Emits nothing.
151    #[serde(rename = "BLANK")]
152    Blank,
153    /// Named field over a content production.
154    ///
155    /// The field `name` matches an edge kind on the schema side; the
156    /// walker resolves the corresponding child vertex and recurses
157    /// into `content` with that child as context.
158    #[serde(rename = "FIELD")]
159    Field {
160        /// Field name (matches edge kind).
161        name: String,
162        /// The contents of the field.
163        content: Box<Self>,
164    },
165    /// An aliased production.
166    ///
167    /// `value` records the parser-visible kind; the walker emits
168    /// `content` and ignores the alias rename.
169    #[serde(rename = "ALIAS")]
170    Alias {
171        /// The aliased content.
172        content: Box<Self>,
173        /// Whether the alias is a named node.
174        #[serde(default)]
175        named: bool,
176        /// The alias's surface name.
177        #[serde(default)]
178        value: String,
179    },
180    /// A token wrapper.
181    ///
182    /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
183    /// lexical token; the walker emits the inner content unchanged.
184    #[serde(rename = "TOKEN")]
185    Token {
186        /// The wrapped content.
187        content: Box<Self>,
188    },
189    /// An immediate-token wrapper (no preceding whitespace).
190    ///
191    /// Treated like [`Production::Token`] for emit purposes.
192    #[serde(rename = "IMMEDIATE_TOKEN")]
193    ImmediateToken {
194        /// The wrapped content.
195        content: Box<Self>,
196    },
197    /// Precedence wrapper.
198    #[serde(rename = "PREC")]
199    Prec {
200        /// Precedence value (numeric or string). Ignored at emit time.
201        #[allow(dead_code)]
202        value: serde_json::Value,
203        /// The wrapped content.
204        content: Box<Self>,
205    },
206    /// Left-associative precedence wrapper.
207    #[serde(rename = "PREC_LEFT")]
208    PrecLeft {
209        /// Precedence value. Ignored at emit time.
210        #[allow(dead_code)]
211        value: serde_json::Value,
212        /// The wrapped content.
213        content: Box<Self>,
214    },
215    /// Right-associative precedence wrapper.
216    #[serde(rename = "PREC_RIGHT")]
217    PrecRight {
218        /// Precedence value. Ignored at emit time.
219        #[allow(dead_code)]
220        value: serde_json::Value,
221        /// The wrapped content.
222        content: Box<Self>,
223    },
224    /// Dynamic precedence wrapper.
225    #[serde(rename = "PREC_DYNAMIC")]
226    PrecDynamic {
227        /// Precedence value. Ignored at emit time.
228        #[allow(dead_code)]
229        value: serde_json::Value,
230        /// The wrapped content.
231        content: Box<Self>,
232    },
233}
234
235/// A grammar's production-rule table, deserialized from `grammar.json`.
236///
237/// Only the fields the emitter consumes are decoded; precedences,
238/// conflicts, externals, and other parser-only metadata are ignored.
239#[derive(Debug, Clone, Deserialize)]
240pub struct Grammar {
241    /// Grammar name (e.g. `"rust"`, `"typescript"`).
242    #[allow(dead_code)]
243    pub name: String,
244    /// Map from rule name (a vertex kind on the schema side) to
245    /// production. Entries are kept in lexical order so iteration
246    /// is deterministic.
247    pub rules: BTreeMap<String, Production>,
248    /// Supertypes declared in the grammar's `supertypes` field. A
249    /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
250    /// references; tree-sitter parsers report a node's kind as one
251    /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
252    /// than the supertype name (`parameter`), so the emitter needs to
253    /// know that a child kind in a subtype set should match the
254    /// supertype name when a SYMBOL references it.
255    #[serde(default, deserialize_with = "deserialize_supertypes")]
256    pub supertypes: std::collections::HashSet<String>,
257    /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
258    /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
259    /// reference on the schema side.
260    ///
261    /// Built once at [`Grammar::from_bytes`] time by walking each
262    /// hidden rule (`_`-prefixed), declared supertype, and named
263    /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
264    /// recording the closure. This replaces the prior heuristic
265    /// `kind_satisfies_symbol` that walked the rule body on every
266    /// query: lookups are now O(1) and the relation is exactly the
267    /// transitive closure of "is reachable via hidden / supertype /
268    /// alias dispatch", with no over-expansion through non-hidden
269    /// non-supertype rule references.
270    #[serde(skip)]
271    pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
272}
273
274fn deserialize_supertypes<'de, D>(
275    deserializer: D,
276) -> Result<std::collections::HashSet<String>, D::Error>
277where
278    D: serde::Deserializer<'de>,
279{
280    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
281    let mut out = std::collections::HashSet::new();
282    for entry in entries {
283        match entry {
284            serde_json::Value::String(s) => {
285                out.insert(s);
286            }
287            serde_json::Value::Object(map) => {
288                if let Some(serde_json::Value::String(name)) = map.get("name") {
289                    out.insert(name.clone());
290                }
291            }
292            _ => {}
293        }
294    }
295    Ok(out)
296}
297
298impl Grammar {
299    /// Parse a grammar's `grammar.json` bytes.
300    ///
301    /// Builds the subtyping closure as part of construction so every
302    /// downstream lookup is O(1). The closure is the least relation
303    /// containing `(K, K)` for every rule key `K` and closed under:
304    ///
305    /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
306    ///   reach SYMBOL `K`, then `K ⊑ S`.
307    /// - supertype expansion: if `S` is in the grammar's supertypes
308    ///   block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
309    /// - alias renaming: if a rule body contains
310    ///   `ALIAS { content: SYMBOL R, value: A, named: true }` where
311    ///   `R` reaches kind `K` (or `K = R` when no further hop), then
312    ///   `A ⊑ R` and `K ⊑ A`.
313    ///
314    /// # Errors
315    ///
316    /// Returns [`ParseError::EmitFailed`] when the bytes are not a
317    /// valid `grammar.json` document.
318    pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
319        let mut grammar: Self =
320            serde_json::from_slice(bytes).map_err(|e| ParseError::EmitFailed {
321                protocol: protocol.to_owned(),
322                reason: format!("grammar.json deserialization failed: {e}"),
323            })?;
324        grammar.subtypes = compute_subtype_closure(&grammar);
325        Ok(grammar)
326    }
327}
328
329/// Compute the subtyping relation as a forward-indexed map from a
330/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
331fn compute_subtype_closure(
332    grammar: &Grammar,
333) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
334    use std::collections::{HashMap, HashSet};
335    // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
336    // recorded whenever Y is reached by walking the grammar's
337    // ALIAS / hidden-rule / supertype dispatch from a position where
338    // K is the actual vertex kind.
339    let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
340    for name in grammar.rules.keys() {
341        subtypes
342            .entry(name.clone())
343            .or_default()
344            .insert(name.clone());
345    }
346
347    // First pass: collect the immediate "satisfies" edges from each
348    // expandable rule (hidden, supertype) to the kinds reachable by
349    // walking its body, plus alias edges.
350    fn walk<'g>(
351        grammar: &'g Grammar,
352        production: &'g Production,
353        visited: &mut HashSet<&'g str>,
354        out: &mut HashSet<String>,
355    ) {
356        match production {
357            Production::Symbol { name } => {
358                // Direct subtype.
359                out.insert(name.clone());
360                // Continue expansion through hidden / supertype rules
361                // so the closure traverses pass-through dispatch.
362                let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
363                if expand && visited.insert(name.as_str()) {
364                    if let Some(rule) = grammar.rules.get(name) {
365                        walk(grammar, rule, visited, out);
366                    }
367                }
368            }
369            Production::Choice { members } | Production::Seq { members } => {
370                for m in members {
371                    walk(grammar, m, visited, out);
372                }
373            }
374            Production::Alias {
375                content,
376                named,
377                value,
378            } => {
379                if *named && !value.is_empty() {
380                    out.insert(value.clone());
381                }
382                walk(grammar, content, visited, out);
383            }
384            Production::Repeat { content }
385            | Production::Repeat1 { content }
386            | Production::Optional { content }
387            | Production::Field { content, .. }
388            | Production::Token { content }
389            | Production::ImmediateToken { content }
390            | Production::Prec { content, .. }
391            | Production::PrecLeft { content, .. }
392            | Production::PrecRight { content, .. }
393            | Production::PrecDynamic { content, .. } => {
394                walk(grammar, content, visited, out);
395            }
396            _ => {}
397        }
398    }
399
400    for (name, rule) in &grammar.rules {
401        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
402        if !expand {
403            continue;
404        }
405        let mut visited: HashSet<&str> = HashSet::new();
406        visited.insert(name.as_str());
407        let mut reachable: HashSet<String> = HashSet::new();
408        walk(grammar, rule, &mut visited, &mut reachable);
409        for kind in &reachable {
410            subtypes
411                .entry(kind.clone())
412                .or_default()
413                .insert(name.clone());
414        }
415    }
416
417    // Aliases: scan every rule body for ALIAS { content, value }
418    // declarations. The kinds reachable from `content` satisfy
419    // `value`, AND (by construction) `value` satisfies the
420    // surrounding rule. Walking the ENTIRE grammar once captures
421    // every alias site, irrespective of which rule introduces it.
422    fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
423        match production {
424            Production::Alias {
425                content,
426                named,
427                value,
428            } => {
429                if *named && !value.is_empty() {
430                    out.push((value.clone(), content.as_ref()));
431                }
432                collect_aliases(content, out);
433            }
434            Production::Choice { members } | Production::Seq { members } => {
435                for m in members {
436                    collect_aliases(m, out);
437                }
438            }
439            Production::Repeat { content }
440            | Production::Repeat1 { content }
441            | Production::Optional { content }
442            | Production::Field { content, .. }
443            | Production::Token { content }
444            | Production::ImmediateToken { content }
445            | Production::Prec { content, .. }
446            | Production::PrecLeft { content, .. }
447            | Production::PrecRight { content, .. }
448            | Production::PrecDynamic { content, .. } => {
449                collect_aliases(content, out);
450            }
451            _ => {}
452        }
453    }
454    let mut aliases: Vec<(String, &Production)> = Vec::new();
455    for rule in grammar.rules.values() {
456        collect_aliases(rule, &mut aliases);
457    }
458    for (alias_value, content) in aliases {
459        let mut visited: HashSet<&str> = HashSet::new();
460        let mut reachable: HashSet<String> = HashSet::new();
461        walk(grammar, content, &mut visited, &mut reachable);
462        // Aliased value satisfies itself and is satisfied by every
463        // kind its content can reach.
464        subtypes
465            .entry(alias_value.clone())
466            .or_default()
467            .insert(alias_value.clone());
468        for kind in reachable {
469            subtypes
470                .entry(kind)
471                .or_default()
472                .insert(alias_value.clone());
473        }
474    }
475
476    // Transitive close: `K ⊑ A` and `A ⊑ B` implies `K ⊑ B`. Iterate
477    // a few rounds; the relation is small so a quick fixed-point
478    // suffices in practice.
479    for _ in 0..8 {
480        let snapshot = subtypes.clone();
481        let mut changed = false;
482        for (kind, supers) in &snapshot {
483            let extra: HashSet<String> = supers
484                .iter()
485                .flat_map(|s| snapshot.get(s).cloned().unwrap_or_default())
486                .collect();
487            let entry = subtypes.entry(kind.clone()).or_default();
488            for s in extra {
489                if entry.insert(s) {
490                    changed = true;
491                }
492            }
493        }
494        if !changed {
495            break;
496        }
497    }
498
499    subtypes
500}
501
502// ═══════════════════════════════════════════════════════════════════
503// Format policy
504// ═══════════════════════════════════════════════════════════════════
505
506/// Whitespace and indentation policy applied during emission.
507///
508/// The default policy inserts a single space between adjacent tokens,
509/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
510/// boundaries. Per-language overrides (idiomatic indent width,
511/// trailing-comma rules, blank-line conventions) can ride alongside
512/// this struct in a follow-up branch; today's defaults aim only for
513/// syntactic validity.
514#[derive(Debug, Clone)]
515pub struct FormatPolicy {
516    /// Number of spaces per indent level.
517    pub indent_width: usize,
518    /// Tokens after which the walker breaks to a new line.
519    pub line_break_after: Vec<String>,
520    /// Tokens that increase indent on emission.
521    pub indent_open: Vec<String>,
522    /// Tokens that decrease indent on emission.
523    pub indent_close: Vec<String>,
524}
525
526impl Default for FormatPolicy {
527    fn default() -> Self {
528        Self {
529            indent_width: 2,
530            line_break_after: vec![";".into(), "{".into(), "}".into()],
531            indent_open: vec!["{".into()],
532            indent_close: vec!["}".into()],
533        }
534    }
535}
536
537// ═══════════════════════════════════════════════════════════════════
538// Emitter
539// ═══════════════════════════════════════════════════════════════════
540
541/// Emit a by-construction schema to source bytes.
542///
543/// `protocol` is the grammar / language name (used in error messages
544/// and to label the entry point).
545///
546/// The walker treats `schema.entries` as the ordered list of root
547/// vertices, falling back to a deterministic by-id ordering when
548/// `entries` is empty. Each root is emitted using the production
549/// associated with its kind in `grammar.rules`.
550///
551/// # Errors
552///
553/// Returns [`ParseError::EmitFailed`] when:
554///
555/// - the schema has no vertices
556/// - a root vertex's kind is not a grammar rule
557/// - a `SYMBOL` reference points at a kind with no rule and no schema
558///   child to resolve it to
559/// - a required `FIELD` has no corresponding edge in the schema
560pub fn emit_pretty(
561    protocol: &str,
562    schema: &Schema,
563    grammar: &Grammar,
564    policy: &FormatPolicy,
565) -> Result<Vec<u8>, ParseError> {
566    let roots = collect_roots(schema);
567    if roots.is_empty() {
568        return Err(ParseError::EmitFailed {
569            protocol: protocol.to_owned(),
570            reason: "schema has no entry vertices".to_owned(),
571        });
572    }
573
574    let mut out = Output::new(policy);
575    for (i, root) in roots.iter().enumerate() {
576        if i > 0 {
577            out.newline();
578        }
579        emit_vertex(protocol, schema, grammar, root, &mut out)?;
580    }
581    Ok(out.finish())
582}
583
584fn collect_roots(schema: &Schema) -> Vec<&panproto_gat::Name> {
585    if !schema.entries.is_empty() {
586        return schema
587            .entries
588            .iter()
589            .filter(|name| schema.vertices.contains_key(*name))
590            .collect();
591    }
592
593    // Fallback: every vertex that is not the target of any structural edge
594    // (sorted by id for determinism).
595    let mut targets: std::collections::HashSet<&panproto_gat::Name> =
596        std::collections::HashSet::new();
597    for edge in schema.edges.keys() {
598        targets.insert(&edge.tgt);
599    }
600    let mut roots: Vec<&panproto_gat::Name> = schema
601        .vertices
602        .keys()
603        .filter(|name| !targets.contains(name))
604        .collect();
605    roots.sort();
606    roots
607}
608
609fn emit_vertex(
610    protocol: &str,
611    schema: &Schema,
612    grammar: &Grammar,
613    vertex_id: &panproto_gat::Name,
614    out: &mut Output<'_>,
615) -> Result<(), ParseError> {
616    let vertex = schema
617        .vertices
618        .get(vertex_id)
619        .ok_or_else(|| ParseError::EmitFailed {
620            protocol: protocol.to_owned(),
621            reason: format!("vertex '{vertex_id}' not found"),
622        })?;
623
624    // Leaf shortcut: a vertex carrying a `literal-value` constraint
625    // and no outgoing structural edges is a terminal token. Emit the
626    // captured value directly. This handles identifiers, numeric
627    // literals, and string literals that the parser stored as
628    // `literal-value` even on by-construction schemas.
629    if let Some(literal) = literal_value(schema, vertex_id) {
630        if children_for(schema, vertex_id).is_empty() {
631            out.token(literal);
632            return Ok(());
633        }
634    }
635
636    let kind = vertex.kind.as_ref();
637    let edges = children_for(schema, vertex_id);
638    if let Some(rule) = grammar.rules.get(kind) {
639        let mut cursor = ChildCursor::new(&edges);
640        return emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out);
641    }
642
643    // No rule for this kind. The parser produced it via an ALIAS
644    // (tree-sitter's `alias($.something, $.actual_kind)`) or via an
645    // external scanner (e.g. YAML's `document` root). Fall back to
646    // walking the children directly so the inner content survives;
647    // surrounding tokens — whose only source is the missing rule —
648    // are necessarily absent.
649    for edge in &edges {
650        emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
651    }
652    Ok(())
653}
654
655/// Linear cursor over a vertex's outgoing edges, used to thread
656/// children through a production rule without double-consuming them.
657struct ChildCursor<'a> {
658    edges: &'a [&'a Edge],
659    consumed: Vec<bool>,
660}
661
662impl<'a> ChildCursor<'a> {
663    fn new(edges: &'a [&'a Edge]) -> Self {
664        Self {
665            edges,
666            consumed: vec![false; edges.len()],
667        }
668    }
669
670    /// Take the next unconsumed edge whose kind equals `field_name`.
671    fn take_field(&mut self, field_name: &str) -> Option<&'a Edge> {
672        for (i, edge) in self.edges.iter().enumerate() {
673            if !self.consumed[i] && edge.kind.as_ref() == field_name {
674                self.consumed[i] = true;
675                return Some(edge);
676            }
677        }
678        None
679    }
680
681    /// Take the next unconsumed edge whose target vertex satisfies
682    /// `predicate`. Returns the edge and the underlying production
683    /// resolution path is the caller's job.
684    fn take_matching(&mut self, predicate: impl Fn(&Edge) -> bool) -> Option<&'a Edge> {
685        for (i, edge) in self.edges.iter().enumerate() {
686            if !self.consumed[i] && predicate(edge) {
687                self.consumed[i] = true;
688                return Some(edge);
689            }
690        }
691        None
692    }
693
694    /// Whether any unconsumed edge satisfies `predicate`.
695    fn has_matching(&self, predicate: impl Fn(&Edge) -> bool) -> bool {
696        self.edges
697            .iter()
698            .enumerate()
699            .any(|(i, edge)| !self.consumed[i] && predicate(edge))
700    }
701}
702
703thread_local! {
704    static EMIT_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
705    /// Set of `(vertex_id, rule_name)` pairs that are currently being
706    /// walked by the recursion. A SYMBOL that resolves to a rule
707    /// already on this stack closes a μ-binder cycle: in the
708    /// coinductive reading, the rule walk at any vertex is the least
709    /// fixed point of `body[μ X . body / X]`, which unfolds at most
710    /// once, with the second visit returning the empty sequence (the
711    /// unit of the free token monoid). Examples that trigger this:
712    /// YAML's `stream` ⊃ `_b_blk_*` mutually-recursive chain, Rust's
713    /// `_expression` ⊃ `binary_expression` ⊃ `_expression`.
714    static EMIT_MU_FRAMES: std::cell::RefCell<std::collections::HashSet<(String, String)>> =
715        std::cell::RefCell::new(std::collections::HashSet::new());
716}
717
718/// Walk a rule at a vertex inside a μ-binder. The wrapping frame is
719/// pushed before recursion and popped after, so any SYMBOL inside
720/// `rule` that re-enters the same `(vertex_id, rule_name)` pair
721/// returns the empty sequence (μ X . body unfolds once).
722fn walk_in_mu_frame(
723    protocol: &str,
724    schema: &Schema,
725    grammar: &Grammar,
726    vertex_id: &panproto_gat::Name,
727    rule_name: &str,
728    rule: &Production,
729    cursor: &mut ChildCursor<'_>,
730    out: &mut Output<'_>,
731) -> Result<(), ParseError> {
732    let key = (vertex_id.to_string(), rule_name.to_owned());
733    let inserted = EMIT_MU_FRAMES.with(|frames| frames.borrow_mut().insert(key.clone()));
734    if !inserted {
735        // We are already walking this rule at this vertex deeper in
736        // the call stack. The coinductive μ-fixed-point reading
737        // returns the empty sequence here; the surrounding
738        // production resumes after the SYMBOL.
739        return Ok(());
740    }
741    let result = emit_production(protocol, schema, grammar, vertex_id, rule, cursor, out);
742    EMIT_MU_FRAMES.with(|frames| {
743        frames.borrow_mut().remove(&key);
744    });
745    result
746}
747
748fn emit_production(
749    protocol: &str,
750    schema: &Schema,
751    grammar: &Grammar,
752    vertex_id: &panproto_gat::Name,
753    production: &Production,
754    cursor: &mut ChildCursor<'_>,
755    out: &mut Output<'_>,
756) -> Result<(), ParseError> {
757    let depth = EMIT_DEPTH.with(|d| {
758        let v = d.get() + 1;
759        d.set(v);
760        v
761    });
762    if depth > 500 {
763        EMIT_DEPTH.with(|d| d.set(d.get() - 1));
764        return Err(ParseError::EmitFailed {
765            protocol: protocol.to_owned(),
766            reason: format!(
767                "emit_production recursion >500 (likely a cyclic grammar; \
768                     vertex='{vertex_id}')"
769            ),
770        });
771    }
772    let result = emit_production_inner(
773        protocol, schema, grammar, vertex_id, production, cursor, out,
774    );
775    EMIT_DEPTH.with(|d| d.set(d.get() - 1));
776    result
777}
778
779fn emit_production_inner(
780    protocol: &str,
781    schema: &Schema,
782    grammar: &Grammar,
783    vertex_id: &panproto_gat::Name,
784    production: &Production,
785    cursor: &mut ChildCursor<'_>,
786    out: &mut Output<'_>,
787) -> Result<(), ParseError> {
788    match production {
789        Production::String { value } => {
790            out.token(value);
791            Ok(())
792        }
793        Production::Pattern { value } => {
794            if let Some(literal) = literal_value(schema, vertex_id) {
795                out.token(literal);
796            } else {
797                out.token(&placeholder_for_pattern(value));
798            }
799            Ok(())
800        }
801        Production::Blank => Ok(()),
802        Production::Symbol { name } => {
803            if name.starts_with('_') {
804                // Hidden rule: not a vertex kind on the schema side.
805                // Inline-expand the rule body so its children take
806                // edges from the current cursor, instead of trying to
807                // take a single child edge that "satisfies" the
808                // hidden rule and discarding the rest of the body
809                // (which would drop tokens like `=` and the trailing
810                // value SYMBOL inside e.g. TOML's `_inline_pair`).
811                //
812                // Wrapped in a μ-frame so a hidden rule that
813                // references its own kind cyclically (or another
814                // hidden rule that closes the cycle) unfolds once
815                // and then collapses to the empty sequence at the
816                // second visit, rather than blowing the stack.
817                if let Some(rule) = grammar.rules.get(name) {
818                    walk_in_mu_frame(
819                        protocol, schema, grammar, vertex_id, name, rule, cursor, out,
820                    )
821                } else {
822                    // External hidden rule (declared in the
823                    // grammar's `externals` block, scanned by C code,
824                    // not listed in `rules`). Heuristic fallback:
825                    // line-ending / EOF externals are universally
826                    // newline-or-empty, so emitting a single newline
827                    // is the right default for grammars like TOML
828                    // whose `pair` SEQ trails into
829                    // `_line_ending_or_eof`. Anything else falls
830                    // through silently.
831                    if name.contains("line_ending")
832                        || name.contains("newline")
833                        || name.ends_with("_or_eof")
834                    {
835                        out.newline();
836                    }
837                    Ok(())
838                }
839            } else if let Some(edge) = take_symbol_match(grammar, schema, cursor, name) {
840                // For supertype / hidden-rule dispatch the child's
841                // own kind names the actual production to walk
842                // (`child.kind` IS the subtype). For ALIAS the
843                // dependent-optic context is carried by the
844                // surrounding `Production::Alias` branch, which calls
845                // `emit_aliased_child` directly; we don't reach here
846                // for that case. So walking `grammar.rules[child.kind]`
847                // via `emit_vertex` is correct: the dependent-optic
848                // path is preserved at every site where it actually
849                // diverges from `child.kind`.
850                emit_vertex(protocol, schema, grammar, &edge.tgt, out)
851            } else if vertex_id_kind(schema, vertex_id) == Some(name.as_str()) {
852                let rule = grammar
853                    .rules
854                    .get(name)
855                    .ok_or_else(|| ParseError::EmitFailed {
856                        protocol: protocol.to_owned(),
857                        reason: format!("no production for SYMBOL '{name}'"),
858                    })?;
859                // Self-reference (`X = ... SYMBOL X ...`): wrap in a
860                // μ-frame so re-entry collapses to the empty sequence.
861                walk_in_mu_frame(
862                    protocol, schema, grammar, vertex_id, name, rule, cursor, out,
863                )
864            } else {
865                // Named rule with no matching child: emit nothing and
866                // let the surrounding CHOICE / OPTIONAL / REPEAT
867                // resolve the absence.
868                Ok(())
869            }
870        }
871        Production::Seq { members } => {
872            for member in members {
873                emit_production(protocol, schema, grammar, vertex_id, member, cursor, out)?;
874            }
875            Ok(())
876        }
877        Production::Choice { members } => {
878            if let Some(matched) =
879                pick_choice_with_cursor(schema, grammar, vertex_id, cursor, members)
880            {
881                emit_production(protocol, schema, grammar, vertex_id, matched, cursor, out)
882            } else {
883                Ok(())
884            }
885        }
886        Production::Repeat { content } | Production::Repeat1 { content } => {
887            let mut emitted_any = false;
888            loop {
889                let cursor_snap = cursor.consumed.clone();
890                let out_snap = out.snapshot();
891                let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
892                let result =
893                    emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
894                let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
895                if result.is_err() || consumed_after == consumed_before {
896                    cursor.consumed = cursor_snap;
897                    out.restore(out_snap);
898                    break;
899                }
900                emitted_any = true;
901            }
902            if matches!(production, Production::Repeat1 { .. }) && !emitted_any {
903                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)?;
904            }
905            Ok(())
906        }
907        Production::Optional { content } => {
908            let cursor_snap = cursor.consumed.clone();
909            let out_snap = out.snapshot();
910            let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
911            let result =
912                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
913            // OPTIONAL is a backtracking site: if the inner production
914            // errored *or* made no progress without leaving a witness
915            // constraint, restore both cursor and output to their
916            // pre-attempt state. Mirrors `Repeat`'s loop body.
917            if result.is_err() {
918                cursor.consumed = cursor_snap;
919                out.restore(out_snap);
920                return result;
921            }
922            let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
923            if consumed_after == consumed_before
924                && !has_relevant_constraint(content, schema, vertex_id)
925            {
926                cursor.consumed = cursor_snap;
927                out.restore(out_snap);
928            }
929            Ok(())
930        }
931        Production::Field { name, content } => {
932            if let Some(edge) = cursor.take_field(name) {
933                emit_in_child_context(protocol, schema, grammar, &edge.tgt, content, out)
934            } else if first_symbol(content).is_none() {
935                // FIELD wraps a non-child production (e.g. a literal
936                // STRING operator like `+` in a binary_expression, or
937                // a CHOICE of STRING tokens). The walker captures
938                // these as interstitials rather than vertices, so the
939                // schema has no field edge to consume; emit the
940                // content in place so the operator / keyword survives
941                // the round-trip.
942                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
943            } else {
944                Ok(())
945            }
946        }
947        Production::Alias {
948            content,
949            named,
950            value,
951        } => {
952            // A named ALIAS rewrites the parser-visible kind to
953            // `value`. If the cursor has an unconsumed child whose
954            // kind matches that alias name, take it and emit the
955            // child using the alias's INNER content as the rule
956            // (e.g. `ALIAS { SYMBOL real_rule, value: "kind_x" }`
957            // means a `kind_x` vertex on the schema should be walked
958            // through `real_rule`'s body, not through whatever rule
959            // happens to be keyed under `kind_x`). This is the
960            // dependent-optic shape: the rule the emitter walks at a
961            // child position is determined by the parent's chosen
962            // alias, not by the child kind alone — without it,
963            // grammars like YAML that introduce the same kind through
964            // many ALIAS sites lose the parent context the moment
965            // emit_vertex is called.
966            if *named && !value.is_empty() {
967                if let Some(edge) = cursor.take_matching(|edge| {
968                    schema
969                        .vertices
970                        .get(&edge.tgt)
971                        .map(|v| v.kind.as_ref() == value.as_str())
972                        .unwrap_or(false)
973                }) {
974                    return emit_aliased_child(protocol, schema, grammar, &edge.tgt, content, out);
975                }
976            }
977            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
978        }
979        Production::Token { content }
980        | Production::ImmediateToken { content }
981        | Production::Prec { content, .. }
982        | Production::PrecLeft { content, .. }
983        | Production::PrecRight { content, .. }
984        | Production::PrecDynamic { content, .. } => {
985            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
986        }
987    }
988}
989
990/// Take the next cursor edge whose target vertex's kind matches the
991/// SYMBOL `name` directly or via inline expansion of a hidden rule.
992fn take_symbol_match<'a>(
993    grammar: &Grammar,
994    schema: &Schema,
995    cursor: &mut ChildCursor<'a>,
996    name: &str,
997) -> Option<&'a Edge> {
998    cursor.take_matching(|edge| {
999        let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1000        kind_satisfies_symbol(grammar, target_kind, name)
1001    })
1002}
1003
1004/// Decide whether a schema vertex of kind `target_kind` satisfies a
1005/// SYMBOL `name` reference in the grammar.
1006///
1007/// Operates as an O(1) lookup against the precomputed subtype
1008/// closure built at [`Grammar::from_bytes`]. The semantic content is
1009/// "K satisfies SYMBOL S iff K is reachable from S by walking the
1010/// grammar's hidden, supertype, and named-alias dispatch": this is
1011/// exactly the relation tree-sitter induces on `(parser-visible kind,
1012/// rule-position)` pairs.
1013fn kind_satisfies_symbol(grammar: &Grammar, target_kind: Option<&str>, name: &str) -> bool {
1014    let Some(target) = target_kind else {
1015        return false;
1016    };
1017    if target == name {
1018        return true;
1019    }
1020    grammar
1021        .subtypes
1022        .get(target)
1023        .is_some_and(|set| set.contains(name))
1024}
1025
1026/// Emit a child reached through an ALIAS production using the
1027/// alias's inner content as the rule, not `grammar.rules[child.kind]`.
1028///
1029/// This carries the dependent-optic context across the ALIAS edge:
1030/// at the parent rule's site we know which underlying production the
1031/// alias wraps (typically `SYMBOL real_rule`), and that's the
1032/// production that should drive the emit walk on the child's
1033/// children. Looking up `grammar.rules.get(child.kind)` instead would
1034/// either fail (the renamed kind has no top-level rule, e.g. YAML's
1035/// `block_mapping_pair`) or pick an arbitrary same-kinded rule from
1036/// elsewhere in the grammar.
1037///
1038/// Walk-context invariant. The dependent-optic shape of `emit_pretty`
1039/// says: the production walked at any vertex is determined by the
1040/// path from the root through the grammar, not by the vertex kind in
1041/// isolation. Two dispatch sites realise that invariant:
1042///
1043/// * [`emit_vertex`] looks up `grammar.rules[child.kind]` and walks
1044///   it. Correct for supertype / hidden-rule dispatch: the child's
1045///   kind on the schema IS the subtype tree-sitter selected, so its
1046///   top-level rule is the right production to walk.
1047/// * `emit_aliased_child` threads the parent rule's `Production`
1048///   directly (the inner `content` of `Production::Alias`) and walks
1049///   it on the child's children. Correct for ALIAS dispatch: the
1050///   child's kind on the schema is the alias's `value` (a renamed
1051///   kind that may have no top-level rule), and the production to
1052///   walk is the alias's content body, supplied by the parent.
1053///
1054/// Together these cover every site where the rule-walked-at-child
1055/// diverges from `grammar.rules[child.kind]`; the recursion site for
1056/// plain SYMBOL therefore correctly delegates to `emit_vertex`, and
1057/// we do not need a richer `WalkContext` value passed by reference.
1058/// The grammar dependency is the thread.
1059fn emit_aliased_child(
1060    protocol: &str,
1061    schema: &Schema,
1062    grammar: &Grammar,
1063    child_id: &panproto_gat::Name,
1064    content: &Production,
1065    out: &mut Output<'_>,
1066) -> Result<(), ParseError> {
1067    // Leaf shortcut: if the child has a literal-value and no
1068    // structural children, emit the captured text. Identifiers and
1069    // similar terminals reach here when an ALIAS wraps a SYMBOL that
1070    // resolves to a PATTERN.
1071    if let Some(literal) = literal_value(schema, child_id) {
1072        if children_for(schema, child_id).is_empty() {
1073            out.token(literal);
1074            return Ok(());
1075        }
1076    }
1077
1078    // Resolve `content` to a rule when it's a SYMBOL (the dominant
1079    // shape: `ALIAS { content: SYMBOL real_rule, value: "kind_x" }`).
1080    if let Production::Symbol { name } = content {
1081        if let Some(rule) = grammar.rules.get(name) {
1082            let edges = children_for(schema, child_id);
1083            let mut cursor = ChildCursor::new(&edges);
1084            return emit_production(protocol, schema, grammar, child_id, rule, &mut cursor, out);
1085        }
1086    }
1087
1088    // Other ALIAS contents (CHOICE, SEQ, literals) walk in place.
1089    let edges = children_for(schema, child_id);
1090    let mut cursor = ChildCursor::new(&edges);
1091    emit_production(
1092        protocol,
1093        schema,
1094        grammar,
1095        child_id,
1096        content,
1097        &mut cursor,
1098        out,
1099    )
1100}
1101
1102fn emit_in_child_context(
1103    protocol: &str,
1104    schema: &Schema,
1105    grammar: &Grammar,
1106    child_id: &panproto_gat::Name,
1107    production: &Production,
1108    out: &mut Output<'_>,
1109) -> Result<(), ParseError> {
1110    // If `production` is a structural wrapper (CHOICE / SEQ /
1111    // OPTIONAL / ...) whose referenced symbols cover the child's own
1112    // kind, the child IS the production's target node and the right
1113    // emit path is `emit_vertex(child)` (which honours the
1114    // literal-value leaf shortcut). Without this guard, FIELD(pattern,
1115    // CHOICE { _pattern, self }) on an identifier child walks the
1116    // CHOICE on the identifier's empty cursor, falls through to the
1117    // first non-BLANK alt, and loses the captured identifier text.
1118    if !matches!(production, Production::Symbol { .. }) {
1119        let child_kind = schema.vertices.get(child_id).map(|v| v.kind.as_ref());
1120        let symbols = referenced_symbols(production);
1121        if symbols
1122            .iter()
1123            .any(|s| kind_satisfies_symbol(grammar, child_kind, s) || child_kind == Some(s))
1124        {
1125            return emit_vertex(protocol, schema, grammar, child_id, out);
1126        }
1127    }
1128    match production {
1129        Production::Symbol { .. } => emit_vertex(protocol, schema, grammar, child_id, out),
1130        _ => {
1131            let edges = children_for(schema, child_id);
1132            let mut cursor = ChildCursor::new(&edges);
1133            emit_production(
1134                protocol,
1135                schema,
1136                grammar,
1137                child_id,
1138                production,
1139                &mut cursor,
1140                out,
1141            )
1142        }
1143    }
1144}
1145
1146fn pick_choice_with_cursor<'a>(
1147    schema: &Schema,
1148    grammar: &Grammar,
1149    vertex_id: &panproto_gat::Name,
1150    cursor: &ChildCursor<'_>,
1151    alternatives: &'a [Production],
1152) -> Option<&'a Production> {
1153    // Discriminator-driven dispatch (highest priority): when the
1154    // walker recorded a `chose-alt-fingerprint` constraint at parse
1155    // time, dispatch directly against that. This is the categorical
1156    // discriminator: it survives stripping of byte-position
1157    // constraints (so by-construction round-trips work) and is the
1158    // explicit witness of which CHOICE alternative the parser took.
1159    //
1160    // Falls back to the live `interstitial-*` substring blob when no
1161    // fingerprint is present (e.g. instances built by callers that
1162    // bypass the AstWalker). Both blobs are scored by the longest
1163    // STRING-literal token in an alternative that matches; the
1164    // length tiebreak prefers `&&` over `&`, `==` over `=`, etc.
1165    let constraint_blob = schema
1166        .constraints
1167        .get(vertex_id)
1168        .map(|cs| {
1169            let fingerprint: Option<&str> = cs
1170                .iter()
1171                .find(|c| c.sort.as_ref() == "chose-alt-fingerprint")
1172                .map(|c| c.value.as_str());
1173            if let Some(fp) = fingerprint {
1174                fp.to_owned()
1175            } else {
1176                cs.iter()
1177                    .filter(|c| {
1178                        let s = c.sort.as_ref();
1179                        s.starts_with("interstitial-") && !s.ends_with("-start-byte")
1180                    })
1181                    .map(|c| c.value.as_str())
1182                    .collect::<Vec<&str>>()
1183                    .join(" ")
1184            }
1185        })
1186        .unwrap_or_default();
1187    let child_kinds: Vec<&str> = schema
1188        .constraints
1189        .get(vertex_id)
1190        .and_then(|cs| {
1191            cs.iter()
1192                .find(|c| c.sort.as_ref() == "chose-alt-child-kinds")
1193                .map(|c| c.value.split_whitespace().collect())
1194        })
1195        .unwrap_or_default();
1196    if !constraint_blob.is_empty() {
1197        // Primary score: literal-token match length. This dominates
1198        // alt selection so existing language tests that depend on
1199        // literal-only fingerprints keep working.
1200        // Secondary score (tiebreaker only): named-symbol kind match
1201        // count, read from the separate `chose-alt-child-kinds`
1202        // constraint (kept apart from the literal fingerprint so
1203        // identifiers like `:` in the kind list don't contaminate the
1204        // literal match). An alt that matches the recorded kinds is a
1205        // stronger witness than one whose only
1206        // overlap is literal punctuation.
1207        let mut best_literal: usize = 0;
1208        let mut best_symbols: usize = 0;
1209        let mut best_alt: Option<&Production> = None;
1210        let mut tied = false;
1211        for alt in alternatives {
1212            let strings = literal_strings(alt);
1213            if strings.is_empty() {
1214                continue;
1215            }
1216            let literal_score = strings
1217                .iter()
1218                .filter(|s| constraint_blob.contains(s.as_str()))
1219                .map(String::len)
1220                .sum::<usize>();
1221            if literal_score == 0 {
1222                continue;
1223            }
1224            // Symbol score is computed only as a tiebreaker among alts
1225            // whose literal-token coverage is the same; it never lifts
1226            // an alt above one with a strictly higher literal score.
1227            // Reads the `chose-alt-child-kinds` constraint (a separate
1228            // sequence the walker emits, kept apart from the literal
1229            // fingerprint to avoid cross-contamination).
1230            let symbol_score = if literal_score >= best_literal && !child_kinds.is_empty() {
1231                let symbols = referenced_symbols(alt);
1232                symbols
1233                    .iter()
1234                    .filter(|sym| {
1235                        let sym_str: &str = sym;
1236                        if child_kinds.contains(&sym_str) {
1237                            return true;
1238                        }
1239                        grammar.subtypes.get(sym_str).is_some_and(|sub_set| {
1240                            sub_set
1241                                .iter()
1242                                .any(|sub| child_kinds.contains(&sub.as_str()))
1243                        })
1244                    })
1245                    .count()
1246            } else {
1247                0
1248            };
1249            let better = literal_score > best_literal
1250                || (literal_score == best_literal && symbol_score > best_symbols);
1251            let same = literal_score == best_literal && symbol_score == best_symbols;
1252            if better {
1253                best_literal = literal_score;
1254                best_symbols = symbol_score;
1255                best_alt = Some(alt);
1256                tied = false;
1257            } else if same && best_alt.is_some() {
1258                tied = true;
1259            }
1260        }
1261        // Only commit to an alt when the fingerprint discriminates it
1262        // uniquely. A tie means the alts share the same literal token
1263        // set (e.g. JSON's `string = CHOICE { SEQ { '"', '"' }, SEQ {
1264        // '"', _string_content, '"' } }` — both alts contain just the
1265        // two `"` tokens). In that case fall through to cursor-based
1266        // dispatch, which uses the actual edge structure.
1267        if let Some(alt) = best_alt {
1268            if !tied {
1269                return Some(alt);
1270            }
1271        }
1272    }
1273
1274    // Cursor-driven dispatch: pick the alternative whose body
1275    // references at least one SYMBOL whose target kind is present in
1276    // the unconsumed cursor edges. `referenced_symbols` walks the
1277    // alternative recursively (across nested SEQs, REPEATs, OPTIONALs,
1278    // FIELDs, etc.) so a leading optional like `attribute_item` does
1279    // not block matching when only the trailing required symbol is
1280    // present on the schema.
1281    for alt in alternatives {
1282        let symbols = referenced_symbols(alt);
1283        if !symbols.is_empty()
1284            && cursor.has_matching(|edge| {
1285                let tk = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1286                symbols
1287                    .iter()
1288                    .any(|s| kind_satisfies_symbol(grammar, tk, s))
1289            })
1290        {
1291            return Some(alt);
1292        }
1293    }
1294
1295    // FIELD dispatch: pick an alternative whose FIELD name matches an
1296    // unconsumed edge kind.
1297    let edge_kinds: Vec<&str> = cursor
1298        .edges
1299        .iter()
1300        .enumerate()
1301        .filter(|(i, _)| !cursor.consumed[*i])
1302        .map(|(_, e)| e.kind.as_ref())
1303        .collect();
1304    for alt in alternatives {
1305        if has_field_in(alt, &edge_kinds) {
1306            return Some(alt);
1307        }
1308    }
1309
1310    // No cursor-driven match. Fall back to:
1311    //
1312    // - BLANK (the explicit empty alternative) when present, so an
1313    //   OPTIONAL-shaped CHOICE compiles to nothing.
1314    // - The first non-`BLANK` alternative as a last resort, used by
1315    //   STRING-only alternatives (keyword tokens) and other choices
1316    //   that don't reach the cursor.
1317    //
1318    // The previous "match own_kind" branch is intentionally absent:
1319    // when an alt's first SYMBOL equals the current vertex's kind, the
1320    // caller is already emitting that vertex's own rule. Recursing
1321    // into the alt would cause a self-loop in the rule walk.
1322    let _ = (schema, vertex_id);
1323    if alternatives.iter().any(|a| matches!(a, Production::Blank)) {
1324        return alternatives.iter().find(|a| matches!(a, Production::Blank));
1325    }
1326    alternatives
1327        .iter()
1328        .find(|alt| !matches!(alt, Production::Blank))
1329}
1330
1331/// Collect every literal STRING token directly inside `production`
1332/// (without descending into SYMBOLs / hidden rules). Used to score
1333/// CHOICE alternatives against the parent vertex's interstitials so
1334/// the right operator / keyword form is picked when the schema
1335/// preserves interstitial fragments from a prior parse.
1336fn literal_strings(production: &Production) -> Vec<String> {
1337    let mut out = Vec::new();
1338    fn walk(p: &Production, out: &mut Vec<String>) {
1339        match p {
1340            Production::String { value } if !value.is_empty() => {
1341                out.push(value.clone());
1342            }
1343            Production::Choice { members } | Production::Seq { members } => {
1344                for m in members {
1345                    walk(m, out);
1346                }
1347            }
1348            Production::Repeat { content }
1349            | Production::Repeat1 { content }
1350            | Production::Optional { content }
1351            | Production::Field { content, .. }
1352            | Production::Alias { content, .. }
1353            | Production::Token { content }
1354            | Production::ImmediateToken { content }
1355            | Production::Prec { content, .. }
1356            | Production::PrecLeft { content, .. }
1357            | Production::PrecRight { content, .. }
1358            | Production::PrecDynamic { content, .. } => walk(content, out),
1359            _ => {}
1360        }
1361    }
1362    walk(production, &mut out);
1363    out
1364}
1365
1366/// Collect every SYMBOL name reachable from `production` without
1367/// crossing into nested rules. Used by `pick_choice_with_cursor` to
1368/// rank alternatives by "any SYMBOL inside this alt matches something
1369/// on the cursor", instead of just the first SYMBOL: a leading
1370/// optional like `attribute_item` then `parameter` is otherwise
1371/// rejected when only the parameter children are present.
1372fn referenced_symbols(production: &Production) -> Vec<&str> {
1373    let mut out = Vec::new();
1374    fn walk<'a>(p: &'a Production, out: &mut Vec<&'a str>) {
1375        match p {
1376            Production::Symbol { name } => out.push(name.as_str()),
1377            Production::Choice { members } | Production::Seq { members } => {
1378                for m in members {
1379                    walk(m, out);
1380                }
1381            }
1382            Production::Repeat { content }
1383            | Production::Repeat1 { content }
1384            | Production::Optional { content }
1385            | Production::Field { content, .. }
1386            | Production::Alias { content, .. }
1387            | Production::Token { content }
1388            | Production::ImmediateToken { content }
1389            | Production::Prec { content, .. }
1390            | Production::PrecLeft { content, .. }
1391            | Production::PrecRight { content, .. }
1392            | Production::PrecDynamic { content, .. } => walk(content, out),
1393            _ => {}
1394        }
1395    }
1396    walk(production, &mut out);
1397    out
1398}
1399
1400fn first_symbol(production: &Production) -> Option<&str> {
1401    match production {
1402        Production::Symbol { name } => Some(name),
1403        Production::Seq { members } => members.iter().find_map(first_symbol),
1404        Production::Choice { members } => members.iter().find_map(first_symbol),
1405        Production::Repeat { content }
1406        | Production::Repeat1 { content }
1407        | Production::Optional { content }
1408        | Production::Field { content, .. }
1409        | Production::Alias { content, .. }
1410        | Production::Token { content }
1411        | Production::ImmediateToken { content }
1412        | Production::Prec { content, .. }
1413        | Production::PrecLeft { content, .. }
1414        | Production::PrecRight { content, .. }
1415        | Production::PrecDynamic { content, .. } => first_symbol(content),
1416        _ => None,
1417    }
1418}
1419
1420fn has_field_in(production: &Production, edge_kinds: &[&str]) -> bool {
1421    match production {
1422        Production::Field { name, .. } => edge_kinds.contains(&name.as_str()),
1423        Production::Seq { members } | Production::Choice { members } => {
1424            members.iter().any(|m| has_field_in(m, edge_kinds))
1425        }
1426        Production::Repeat { content }
1427        | Production::Repeat1 { content }
1428        | Production::Optional { content }
1429        | Production::Alias { content, .. }
1430        | Production::Token { content }
1431        | Production::ImmediateToken { content }
1432        | Production::Prec { content, .. }
1433        | Production::PrecLeft { content, .. }
1434        | Production::PrecRight { content, .. }
1435        | Production::PrecDynamic { content, .. } => has_field_in(content, edge_kinds),
1436        _ => false,
1437    }
1438}
1439
1440fn has_relevant_constraint(
1441    production: &Production,
1442    schema: &Schema,
1443    vertex_id: &panproto_gat::Name,
1444) -> bool {
1445    let constraints = match schema.constraints.get(vertex_id) {
1446        Some(c) => c,
1447        None => return false,
1448    };
1449    fn walk(production: &Production, constraints: &[panproto_schema::Constraint]) -> bool {
1450        match production {
1451            Production::String { value } => constraints
1452                .iter()
1453                .any(|c| c.value == *value || c.sort.as_ref() == value),
1454            Production::Field { name, content } => {
1455                constraints.iter().any(|c| c.sort.as_ref() == name) || walk(content, constraints)
1456            }
1457            Production::Seq { members } | Production::Choice { members } => {
1458                members.iter().any(|m| walk(m, constraints))
1459            }
1460            Production::Repeat { content }
1461            | Production::Repeat1 { content }
1462            | Production::Optional { content }
1463            | Production::Alias { content, .. }
1464            | Production::Token { content }
1465            | Production::ImmediateToken { content }
1466            | Production::Prec { content, .. }
1467            | Production::PrecLeft { content, .. }
1468            | Production::PrecRight { content, .. }
1469            | Production::PrecDynamic { content, .. } => walk(content, constraints),
1470            _ => false,
1471        }
1472    }
1473    walk(production, constraints)
1474}
1475
1476fn children_for<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Vec<&'a Edge> {
1477    let mut edges: Vec<&Edge> = schema
1478        .edges
1479        .keys()
1480        .filter(|e| &e.src == vertex_id)
1481        .collect();
1482    edges.sort_by_key(|e| {
1483        // Edges with an explicit ordering position come first; remaining
1484        // edges sort lexicographically by (kind, target id) for
1485        // deterministic walks.
1486        let pos = schema.orderings.get(*e).copied().unwrap_or(u32::MAX);
1487        (pos, e.kind.clone(), e.tgt.clone())
1488    });
1489    edges
1490}
1491
1492fn vertex_id_kind<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1493    schema.vertices.get(vertex_id).map(|v| v.kind.as_ref())
1494}
1495
1496fn literal_value<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1497    schema
1498        .constraints
1499        .get(vertex_id)?
1500        .iter()
1501        .find(|c| c.sort.as_ref() == "literal-value")
1502        .map(|c| c.value.as_str())
1503}
1504
1505fn placeholder_for_pattern(pattern: &str) -> String {
1506    // Heuristic placeholder for unconstrained PATTERN terminals.
1507    //
1508    // First handle the "the regex IS a literal escape" cases that
1509    // tree-sitter grammars use as separators (`\n`, `\r\n`, `;`,
1510    // etc.); emitting the matching character is always preferable
1511    // to a `_x` identifier-like placeholder when the surrounding
1512    // grammar expects a separator.
1513    let simple_lit = decode_simple_pattern_literal(pattern);
1514    if let Some(lit) = simple_lit {
1515        return lit;
1516    }
1517
1518    if pattern.contains("[0-9]") || pattern.contains("\\d") {
1519        "0".into()
1520    } else if pattern.contains("[a-zA-Z_]") || pattern.contains("\\w") {
1521        "_x".into()
1522    } else if pattern.contains('"') || pattern.contains('\'') {
1523        "\"\"".into()
1524    } else {
1525        "_".into()
1526    }
1527}
1528
1529/// Decode a tree-sitter PATTERN whose regex is a simple literal
1530/// (newline, semicolon, comma, etc.) to the byte sequence it matches.
1531/// Returns `None` for patterns with character classes, alternations,
1532/// or quantifiers; the caller falls back to the heuristic placeholder.
1533fn decode_simple_pattern_literal(pattern: &str) -> Option<String> {
1534    // Skip patterns containing regex metachars that would broaden the
1535    // match beyond a single literal byte sequence.
1536    if pattern
1537        .chars()
1538        .any(|c| matches!(c, '[' | ']' | '(' | ')' | '*' | '+' | '?' | '|' | '{' | '}'))
1539    {
1540        return None;
1541    }
1542    let mut out = String::new();
1543    let mut chars = pattern.chars();
1544    while let Some(c) = chars.next() {
1545        if c == '\\' {
1546            match chars.next() {
1547                Some('n') => out.push('\n'),
1548                Some('r') => out.push('\r'),
1549                Some('t') => out.push('\t'),
1550                Some('\\') => out.push('\\'),
1551                Some('/') => out.push('/'),
1552                Some(other) => out.push(other),
1553                None => return None,
1554            }
1555        } else {
1556            out.push(c);
1557        }
1558    }
1559    Some(out)
1560}
1561
1562// ═══════════════════════════════════════════════════════════════════
1563// Token list output with Spacing algebra
1564// ═══════════════════════════════════════════════════════════════════
1565//
1566// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
1567// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
1568// by `FormatPolicy`. Separating the structural output from the layout
1569// decision means each phase has one job: emit walks the grammar and
1570// pushes tokens; layout is a single fold, locally driven by adjacent
1571// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
1572
1573#[derive(Clone)]
1574enum Token {
1575    /// A user-visible terminal contributed by the grammar.
1576    Lit(String),
1577    /// `indent_open` marker emitted when a `Lit` matched the policy's
1578    /// open list. Carried as a separate token so layout can decide to
1579    /// break + indent without re-scanning.
1580    IndentOpen,
1581    /// `indent_close` marker emitted before a closer-`Lit`.
1582    IndentClose,
1583    /// "Break a line here if not already at line start" — used after
1584    /// statements/declarations and after open braces.
1585    LineBreak,
1586}
1587
1588struct Output<'a> {
1589    tokens: Vec<Token>,
1590    policy: &'a FormatPolicy,
1591}
1592
1593#[derive(Clone)]
1594struct OutputSnapshot {
1595    tokens_len: usize,
1596}
1597
1598impl<'a> Output<'a> {
1599    fn new(policy: &'a FormatPolicy) -> Self {
1600        Self {
1601            tokens: Vec::new(),
1602            policy,
1603        }
1604    }
1605
1606    fn token(&mut self, value: &str) {
1607        if value.is_empty() {
1608            return;
1609        }
1610
1611        if self.policy.indent_close.iter().any(|t| t == value) {
1612            self.tokens.push(Token::IndentClose);
1613        }
1614
1615        self.tokens.push(Token::Lit(value.to_owned()));
1616
1617        if self.policy.indent_open.iter().any(|t| t == value) {
1618            self.tokens.push(Token::IndentOpen);
1619            self.tokens.push(Token::LineBreak);
1620        } else if self.policy.line_break_after.iter().any(|t| t == value) {
1621            self.tokens.push(Token::LineBreak);
1622        }
1623    }
1624
1625    fn newline(&mut self) {
1626        self.tokens.push(Token::LineBreak);
1627    }
1628
1629    fn snapshot(&self) -> OutputSnapshot {
1630        OutputSnapshot {
1631            tokens_len: self.tokens.len(),
1632        }
1633    }
1634
1635    fn restore(&mut self, snap: OutputSnapshot) {
1636        self.tokens.truncate(snap.tokens_len);
1637    }
1638
1639    fn finish(self) -> Vec<u8> {
1640        layout(&self.tokens, self.policy)
1641    }
1642}
1643
1644/// Fold a token list into bytes. The algebra:
1645/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
1646/// * `IndentOpen` / `IndentClose` adjust a depth counter,
1647/// * `LineBreak` writes `\n` if not already at line start, then the
1648///   next `Lit` writes `indent * indent_width` spaces of indent.
1649fn layout(tokens: &[Token], policy: &FormatPolicy) -> Vec<u8> {
1650    let mut bytes = Vec::new();
1651    let mut indent: usize = 0;
1652    let mut at_line_start = true;
1653    let mut last_lit: Option<&str> = None;
1654
1655    for tok in tokens {
1656        match tok {
1657            Token::IndentOpen => indent += 1,
1658            Token::IndentClose => {
1659                indent = indent.saturating_sub(1);
1660                if !at_line_start {
1661                    bytes.push(b'\n');
1662                    at_line_start = true;
1663                }
1664            }
1665            Token::LineBreak => {
1666                if !at_line_start {
1667                    bytes.push(b'\n');
1668                    at_line_start = true;
1669                }
1670            }
1671            Token::Lit(value) => {
1672                if at_line_start {
1673                    bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
1674                } else if let Some(prev) = last_lit {
1675                    if needs_space_between(prev, value) {
1676                        bytes.push(b' ');
1677                    }
1678                }
1679                bytes.extend_from_slice(value.as_bytes());
1680                at_line_start = false;
1681                last_lit = Some(value.as_str());
1682            }
1683        }
1684    }
1685
1686    if !at_line_start {
1687        bytes.push(b'\n');
1688    }
1689    bytes
1690}
1691
1692fn needs_space_between(last: &str, next: &str) -> bool {
1693    if last.is_empty() || next.is_empty() {
1694        return false;
1695    }
1696    if is_punct_open(last) || is_punct_open(next) {
1697        return false;
1698    }
1699    if is_punct_close(next) {
1700        return false;
1701    }
1702    if is_punct_close(last) && is_punct_punctuation(next) {
1703        return false;
1704    }
1705    if last == "." || next == "." {
1706        return false;
1707    }
1708    if last_is_word_like(last) && first_is_word_like(next) {
1709        return true;
1710    }
1711    if last_ends_with_alnum(last) && first_is_alnum_or_underscore(next) {
1712        return true;
1713    }
1714    // Adjacent operator runs: keep them apart so the lexer doesn't glue
1715    // `>` and `=` into `>=` unintentionally.
1716    true
1717}
1718
1719fn is_punct_open(s: &str) -> bool {
1720    matches!(s, "(" | "[" | "{" | "\"" | "'" | "`")
1721}
1722
1723fn is_punct_close(s: &str) -> bool {
1724    matches!(s, ")" | "]" | "}" | "," | ";" | ":" | "\"" | "'" | "`")
1725}
1726
1727fn is_punct_punctuation(s: &str) -> bool {
1728    matches!(s, "," | ";" | ":" | "." | ")" | "]" | "}")
1729}
1730
1731fn last_is_word_like(s: &str) -> bool {
1732    s.chars()
1733        .next_back()
1734        .map(|c| c.is_alphanumeric() || c == '_')
1735        .unwrap_or(false)
1736}
1737
1738fn first_is_word_like(s: &str) -> bool {
1739    s.chars()
1740        .next()
1741        .map(|c| c.is_alphanumeric() || c == '_')
1742        .unwrap_or(false)
1743}
1744
1745fn last_ends_with_alnum(s: &str) -> bool {
1746    s.chars()
1747        .next_back()
1748        .map(char::is_alphanumeric)
1749        .unwrap_or(false)
1750}
1751
1752fn first_is_alnum_or_underscore(s: &str) -> bool {
1753    s.chars()
1754        .next()
1755        .map(|c| c.is_alphanumeric() || c == '_')
1756        .unwrap_or(false)
1757}
1758
1759#[cfg(test)]
1760mod tests {
1761    use super::*;
1762
1763    #[test]
1764    fn parses_simple_grammar_json() {
1765        let bytes = br#"{
1766            "name": "tiny",
1767            "rules": {
1768                "program": {
1769                    "type": "SEQ",
1770                    "members": [
1771                        {"type": "STRING", "value": "hello"},
1772                        {"type": "STRING", "value": ";"}
1773                    ]
1774                }
1775            }
1776        }"#;
1777        let g = Grammar::from_bytes("tiny", bytes).expect("valid tiny grammar");
1778        assert!(g.rules.contains_key("program"));
1779    }
1780
1781    #[test]
1782    fn output_emits_punctuation_without_leading_space() {
1783        let policy = FormatPolicy::default();
1784        let mut out = Output::new(&policy);
1785        out.token("foo");
1786        out.token("(");
1787        out.token(")");
1788        out.token(";");
1789        let bytes = out.finish();
1790        let s = std::str::from_utf8(&bytes).expect("ascii output");
1791        assert!(s.starts_with("foo();"), "got {s:?}");
1792    }
1793
1794    #[test]
1795    fn grammar_from_bytes_rejects_malformed_input() {
1796        let result = Grammar::from_bytes("malformed", b"not json");
1797        let err = result.expect_err("malformed bytes must yield Err");
1798        let msg = err.to_string();
1799        assert!(
1800            msg.contains("malformed"),
1801            "error message should name the protocol: {msg:?}"
1802        );
1803    }
1804
1805    #[test]
1806    fn output_indents_after_open_brace() {
1807        let policy = FormatPolicy::default();
1808        let mut out = Output::new(&policy);
1809        out.token("fn");
1810        out.token("foo");
1811        out.token("(");
1812        out.token(")");
1813        out.token("{");
1814        out.token("body");
1815        out.token("}");
1816        let bytes = out.finish();
1817        let s = std::str::from_utf8(&bytes).expect("ascii output");
1818        assert!(s.contains("{\n"), "newline after opening brace: {s:?}");
1819        assert!(s.contains("body"), "body inside block: {s:?}");
1820        assert!(s.ends_with("}\n"), "newline after closing brace: {s:?}");
1821    }
1822
1823    #[test]
1824    fn output_no_space_between_word_and_dot() {
1825        let policy = FormatPolicy::default();
1826        let mut out = Output::new(&policy);
1827        out.token("foo");
1828        out.token(".");
1829        out.token("bar");
1830        let bytes = out.finish();
1831        let s = std::str::from_utf8(&bytes).expect("ascii output");
1832        assert!(s.starts_with("foo.bar"), "no space around dot: {s:?}");
1833    }
1834
1835    #[test]
1836    fn output_snapshot_restore_truncates_bytes() {
1837        let policy = FormatPolicy::default();
1838        let mut out = Output::new(&policy);
1839        out.token("keep");
1840        let snap = out.snapshot();
1841        out.token("drop");
1842        out.token("more");
1843        out.restore(snap);
1844        out.token("after");
1845        let bytes = out.finish();
1846        let s = std::str::from_utf8(&bytes).expect("ascii output");
1847        assert!(s.contains("keep"), "kept token survives: {s:?}");
1848        assert!(s.contains("after"), "post-restore token visible: {s:?}");
1849        assert!(!s.contains("drop"), "rolled-back token removed: {s:?}");
1850        assert!(!s.contains("more"), "rolled-back token removed: {s:?}");
1851    }
1852
1853    #[test]
1854    fn child_cursor_take_field_consumes_once() {
1855        let edges_owned: Vec<Edge> = vec![Edge {
1856            src: panproto_gat::Name::from("p"),
1857            tgt: panproto_gat::Name::from("c"),
1858            kind: panproto_gat::Name::from("name"),
1859            name: None,
1860        }];
1861        let edges: Vec<&Edge> = edges_owned.iter().collect();
1862        let mut cursor = ChildCursor::new(&edges);
1863        let first = cursor.take_field("name");
1864        let second = cursor.take_field("name");
1865        assert!(first.is_some(), "first take returns the edge");
1866        assert!(
1867            second.is_none(),
1868            "second take returns None (already consumed)"
1869        );
1870    }
1871
1872    #[test]
1873    fn child_cursor_take_matching_predicate() {
1874        let edges_owned: Vec<Edge> = vec![
1875            Edge {
1876                src: "p".into(),
1877                tgt: "c1".into(),
1878                kind: "child_of".into(),
1879                name: None,
1880            },
1881            Edge {
1882                src: "p".into(),
1883                tgt: "c2".into(),
1884                kind: "key".into(),
1885                name: None,
1886            },
1887        ];
1888        let edges: Vec<&Edge> = edges_owned.iter().collect();
1889        let mut cursor = ChildCursor::new(&edges);
1890        assert!(cursor.has_matching(|e| e.kind.as_ref() == "key"));
1891        let taken = cursor.take_matching(|e| e.kind.as_ref() == "key");
1892        assert!(taken.is_some());
1893        assert!(
1894            !cursor.has_matching(|e| e.kind.as_ref() == "key"),
1895            "consumed edge no longer matches"
1896        );
1897        assert!(
1898            cursor.has_matching(|e| e.kind.as_ref() == "child_of"),
1899            "the other edge is still available"
1900        );
1901    }
1902
1903    #[test]
1904    fn kind_satisfies_symbol_direct_match() {
1905        let bytes = br#"{
1906            "name": "tiny",
1907            "rules": {
1908                "x": {"type": "STRING", "value": "x"}
1909            }
1910        }"#;
1911        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
1912        assert!(kind_satisfies_symbol(&g, Some("x"), "x"));
1913        assert!(!kind_satisfies_symbol(&g, Some("y"), "x"));
1914        assert!(!kind_satisfies_symbol(&g, None, "x"));
1915    }
1916
1917    #[test]
1918    fn kind_satisfies_symbol_through_hidden_rule() {
1919        let bytes = br#"{
1920            "name": "tiny",
1921            "rules": {
1922                "_value": {
1923                    "type": "CHOICE",
1924                    "members": [
1925                        {"type": "SYMBOL", "name": "object"},
1926                        {"type": "SYMBOL", "name": "number"}
1927                    ]
1928                },
1929                "object": {"type": "STRING", "value": "{}"},
1930                "number": {"type": "PATTERN", "value": "[0-9]+"}
1931            }
1932        }"#;
1933        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
1934        assert!(
1935            kind_satisfies_symbol(&g, Some("number"), "_value"),
1936            "number is reachable from _value via CHOICE"
1937        );
1938        assert!(
1939            kind_satisfies_symbol(&g, Some("object"), "_value"),
1940            "object is reachable from _value via CHOICE"
1941        );
1942        assert!(
1943            !kind_satisfies_symbol(&g, Some("string"), "_value"),
1944            "string is NOT among the alternatives"
1945        );
1946    }
1947
1948    #[test]
1949    fn first_symbol_skips_string_terminals() {
1950        let prod: Production = serde_json::from_str(
1951            r#"{
1952                "type": "SEQ",
1953                "members": [
1954                    {"type": "STRING", "value": "{"},
1955                    {"type": "SYMBOL", "name": "body"},
1956                    {"type": "STRING", "value": "}"}
1957                ]
1958            }"#,
1959        )
1960        .expect("valid SEQ");
1961        assert_eq!(first_symbol(&prod), Some("body"));
1962    }
1963
1964    #[test]
1965    fn placeholder_for_pattern_routes_by_regex_class() {
1966        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
1967        assert_eq!(placeholder_for_pattern("[a-zA-Z_]\\w*"), "_x");
1968        assert_eq!(placeholder_for_pattern("\"[^\"]*\""), "\"\"");
1969        assert_eq!(placeholder_for_pattern("\\d+\\.\\d+"), "0");
1970    }
1971
1972    #[test]
1973    fn format_policy_default_breaks_after_semicolon() {
1974        let policy = FormatPolicy::default();
1975        assert!(policy.line_break_after.iter().any(|t| t == ";"));
1976        assert!(policy.indent_open.iter().any(|t| t == "{"));
1977        assert!(policy.indent_close.iter().any(|t| t == "}"));
1978        assert_eq!(policy.indent_width, 2);
1979    }
1980
1981    #[test]
1982    fn placeholder_decodes_literal_pattern_separators() {
1983        // PATTERN regexes that match a single literal byte sequence
1984        // (newline, semicolon, comma) emit the bytes verbatim instead
1985        // of falling through to the `_` catch-all.
1986        assert_eq!(placeholder_for_pattern("\\n"), "\n");
1987        assert_eq!(placeholder_for_pattern("\\r\\n"), "\r\n");
1988        assert_eq!(placeholder_for_pattern(";"), ";");
1989        // Patterns with character classes / alternation still route
1990        // through the heuristic.
1991        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
1992        assert_eq!(placeholder_for_pattern("a|b"), "_");
1993    }
1994
1995    #[test]
1996    fn supertypes_decode_from_grammar_json_strings() {
1997        // Tree-sitter older grammars list supertypes as bare strings.
1998        let bytes = br#"{
1999            "name": "tiny",
2000            "supertypes": ["expression"],
2001            "rules": {
2002                "expression": {
2003                    "type": "CHOICE",
2004                    "members": [
2005                        {"type": "SYMBOL", "name": "binary_expression"},
2006                        {"type": "SYMBOL", "name": "identifier"}
2007                    ]
2008                },
2009                "binary_expression": {"type": "STRING", "value": "x"},
2010                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2011            }
2012        }"#;
2013        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2014        assert!(g.supertypes.contains("expression"));
2015        // identifier matches the supertype `expression`.
2016        assert!(kind_satisfies_symbol(&g, Some("identifier"), "expression"));
2017        // unrelated kinds do not.
2018        assert!(!kind_satisfies_symbol(&g, Some("string"), "expression"));
2019    }
2020
2021    #[test]
2022    fn supertypes_decode_from_grammar_json_objects() {
2023        // Recent grammars list supertypes as `{type: SYMBOL, name: ...}`
2024        // entries instead of bare strings.
2025        let bytes = br#"{
2026            "name": "tiny",
2027            "supertypes": [{"type": "SYMBOL", "name": "stmt"}],
2028            "rules": {
2029                "stmt": {
2030                    "type": "CHOICE",
2031                    "members": [
2032                        {"type": "SYMBOL", "name": "while_stmt"},
2033                        {"type": "SYMBOL", "name": "if_stmt"}
2034                    ]
2035                },
2036                "while_stmt": {"type": "STRING", "value": "while"},
2037                "if_stmt": {"type": "STRING", "value": "if"}
2038            }
2039        }"#;
2040        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2041        assert!(g.supertypes.contains("stmt"));
2042        assert!(kind_satisfies_symbol(&g, Some("while_stmt"), "stmt"));
2043    }
2044
2045    #[test]
2046    fn alias_value_matches_kind() {
2047        // A named ALIAS rewrites the parser-visible kind to `value`;
2048        // `kind_satisfies_symbol` should accept that rewritten kind
2049        // when looking up the original SYMBOL.
2050        let bytes = br#"{
2051            "name": "tiny",
2052            "rules": {
2053                "_package_identifier": {
2054                    "type": "ALIAS",
2055                    "named": true,
2056                    "value": "package_identifier",
2057                    "content": {"type": "SYMBOL", "name": "identifier"}
2058                },
2059                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2060            }
2061        }"#;
2062        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2063        assert!(kind_satisfies_symbol(
2064            &g,
2065            Some("package_identifier"),
2066            "_package_identifier"
2067        ));
2068    }
2069
2070    #[test]
2071    fn referenced_symbols_walks_nested_seq() {
2072        let prod: Production = serde_json::from_str(
2073            r#"{
2074                "type": "SEQ",
2075                "members": [
2076                    {"type": "CHOICE", "members": [
2077                        {"type": "SYMBOL", "name": "attribute_item"},
2078                        {"type": "BLANK"}
2079                    ]},
2080                    {"type": "SYMBOL", "name": "parameter"},
2081                    {"type": "REPEAT", "content": {
2082                        "type": "SEQ",
2083                        "members": [
2084                            {"type": "STRING", "value": ","},
2085                            {"type": "SYMBOL", "name": "parameter"}
2086                        ]
2087                    }}
2088                ]
2089            }"#,
2090        )
2091        .expect("seq");
2092        let symbols = referenced_symbols(&prod);
2093        assert!(symbols.contains(&"attribute_item"));
2094        assert!(symbols.contains(&"parameter"));
2095    }
2096
2097    #[test]
2098    fn literal_strings_collects_choice_members() {
2099        let prod: Production = serde_json::from_str(
2100            r#"{
2101                "type": "CHOICE",
2102                "members": [
2103                    {"type": "STRING", "value": "+"},
2104                    {"type": "STRING", "value": "-"},
2105                    {"type": "STRING", "value": "*"}
2106                ]
2107            }"#,
2108        )
2109        .expect("choice");
2110        let strings = literal_strings(&prod);
2111        assert_eq!(strings, vec!["+", "-", "*"]);
2112    }
2113}