panproto_parse/emit_pretty.rs
1#![allow(
2 clippy::module_name_repetitions,
3 clippy::too_many_lines,
4 clippy::too_many_arguments,
5 clippy::map_unwrap_or,
6 clippy::option_if_let_else,
7 clippy::elidable_lifetime_names,
8 clippy::items_after_statements,
9 clippy::needless_pass_by_value,
10 clippy::single_match_else,
11 clippy::manual_let_else,
12 clippy::match_same_arms,
13 clippy::missing_const_for_fn,
14 clippy::single_char_pattern,
15 clippy::naive_bytecount,
16 clippy::expect_used,
17 clippy::redundant_pub_crate,
18 clippy::used_underscore_binding,
19 clippy::redundant_field_names,
20 clippy::struct_field_names,
21 clippy::redundant_else,
22 clippy::similar_names
23)]
24
25//! De-novo source emission from a by-construction schema.
26//!
27//! [`AstParser::emit`] reconstructs source from byte-position fragments
28//! that the parser stored on the schema during `parse`. That works for
29//! edit pipelines (`parse → transform → emit`) but fails for schemas
30//! built by hand (`SchemaBuilder` with no parse history): they carry
31//! no `start-byte`, no `interstitial-N`, no `literal-value`, and the
32//! reconstructor returns `Err(EmitFailed { reason: "schema has no
33//! text fragments" })`.
34//!
35//! This module renders such schemas to source bytes by walking
36//! tree-sitter's `grammar.json` production rules. For each schema
37//! vertex of kind `K`, the walker looks up `K`'s production in the
38//! grammar and emits its body in order:
39//!
40//! - `STRING` nodes contribute literal token bytes directly.
41//! - `SYMBOL` and `FIELD` nodes recurse into the schema's children,
42//! matching by edge kind (which is the tree-sitter field name).
43//! - `SEQ` emits its members in order.
44//! - `CHOICE` picks the alternative whose head `SYMBOL` matches an
45//! actual child kind, or whose terminals appear in the rendered
46//! prefix; falls back to the first non-`BLANK` alternative when no
47//! alternative matches.
48//! - `REPEAT` and `REPEAT1` emit their content once per matching
49//! child edge in declared order.
50//! - `OPTIONAL` emits its content iff a corresponding child edge or
51//! constraint is populated.
52//! - `PATTERN` is a regex placeholder for variable-text terminals
53//! (identifiers, numbers, quoted strings). The walker emits a
54//! `literal-value` constraint when present and otherwise falls
55//! back to a placeholder derived from the regex shape.
56//! - `BLANK`, `TOKEN`, `IMMEDIATE_TOKEN`, `ALIAS`, `PREC*` are
57//! handled transparently (the inner content is emitted; the
58//! wrapper is dropped).
59//!
60//! Whitespace and indentation come from a `FormatPolicy` applied
61//! during emission. The default policy inserts a single space between
62//! adjacent tokens, a newline after `;` / `}` / `{`, and tracks an
63//! indent counter on `{` / `}` boundaries.
64//!
65//! Output is *syntactically valid* for any grammar that ships
66//! `grammar.json`. Idiomatic formatting (rustfmt-style spacing rules,
67//! per-language conventions) is a polish layer that lives outside
68//! this module.
69
70use std::collections::BTreeMap;
71
72use panproto_schema::{Edge, Schema};
73use serde::Deserialize;
74
75use crate::error::ParseError;
76
77// ═══════════════════════════════════════════════════════════════════
78// Grammar JSON model
79// ═══════════════════════════════════════════════════════════════════
80
81/// A single tree-sitter production rule.
82///
83/// Mirrors the shape emitted by `tree-sitter generate`: every node has
84/// a `type` discriminator that selects a structural variant. The
85/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
86/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
87/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
88/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
89#[derive(Debug, Clone, Deserialize)]
90#[serde(tag = "type")]
91#[non_exhaustive]
92pub enum Production {
93 /// Concatenation of productions.
94 #[serde(rename = "SEQ")]
95 Seq {
96 /// Ordered members; each is emitted in turn.
97 members: Vec<Self>,
98 },
99 /// Alternation between productions.
100 #[serde(rename = "CHOICE")]
101 Choice {
102 /// Alternatives; the walker picks one based on the schema's
103 /// children and constraints.
104 members: Vec<Self>,
105 },
106 /// Zero-or-more repetition.
107 #[serde(rename = "REPEAT")]
108 Repeat {
109 /// The repeated body.
110 content: Box<Self>,
111 },
112 /// One-or-more repetition.
113 #[serde(rename = "REPEAT1")]
114 Repeat1 {
115 /// The repeated body.
116 content: Box<Self>,
117 },
118 /// Optional inclusion (zero or one).
119 ///
120 /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
121 /// BLANK }`, but recent generator versions also emit explicit
122 /// `OPTIONAL` nodes; both shapes are accepted.
123 #[serde(rename = "OPTIONAL")]
124 Optional {
125 /// The optional body.
126 content: Box<Self>,
127 },
128 /// Reference to another rule by name.
129 #[serde(rename = "SYMBOL")]
130 Symbol {
131 /// Name of the referenced rule (matches a vertex kind on the
132 /// schema side).
133 name: String,
134 },
135 /// Literal token bytes.
136 #[serde(rename = "STRING")]
137 String {
138 /// The literal token. Emitted verbatim.
139 value: String,
140 },
141 /// Regex-matched terminal.
142 ///
143 /// At parse time this matches arbitrary bytes; at emit time the
144 /// walker substitutes a `literal-value` constraint when present
145 /// and falls back to a placeholder otherwise.
146 #[serde(rename = "PATTERN")]
147 Pattern {
148 /// The original regex.
149 value: String,
150 },
151 /// The empty production. Emits nothing.
152 #[serde(rename = "BLANK")]
153 Blank,
154 /// Named field over a content production.
155 ///
156 /// The field `name` matches an edge kind on the schema side; the
157 /// walker resolves the corresponding child vertex and recurses
158 /// into `content` with that child as context.
159 #[serde(rename = "FIELD")]
160 Field {
161 /// Field name (matches edge kind).
162 name: String,
163 /// The contents of the field.
164 content: Box<Self>,
165 },
166 /// An aliased production.
167 ///
168 /// `value` records the parser-visible kind; the walker emits
169 /// `content` and ignores the alias rename.
170 #[serde(rename = "ALIAS")]
171 Alias {
172 /// The aliased content.
173 content: Box<Self>,
174 /// Whether the alias is a named node.
175 #[serde(default)]
176 named: bool,
177 /// The alias's surface name.
178 #[serde(default)]
179 value: String,
180 },
181 /// A token wrapper.
182 ///
183 /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
184 /// lexical token; the walker emits the inner content unchanged.
185 #[serde(rename = "TOKEN")]
186 Token {
187 /// The wrapped content.
188 content: Box<Self>,
189 },
190 /// An immediate-token wrapper (no preceding whitespace).
191 ///
192 /// Treated like [`Production::Token`] for emit purposes.
193 #[serde(rename = "IMMEDIATE_TOKEN")]
194 ImmediateToken {
195 /// The wrapped content.
196 content: Box<Self>,
197 },
198 /// Precedence wrapper.
199 #[serde(rename = "PREC")]
200 Prec {
201 /// Precedence value (numeric or string). Ignored at emit time.
202 #[allow(dead_code)]
203 value: serde_json::Value,
204 /// The wrapped content.
205 content: Box<Self>,
206 },
207 /// Left-associative precedence wrapper.
208 #[serde(rename = "PREC_LEFT")]
209 PrecLeft {
210 /// Precedence value. Ignored at emit time.
211 #[allow(dead_code)]
212 value: serde_json::Value,
213 /// The wrapped content.
214 content: Box<Self>,
215 },
216 /// Right-associative precedence wrapper.
217 #[serde(rename = "PREC_RIGHT")]
218 PrecRight {
219 /// Precedence value. Ignored at emit time.
220 #[allow(dead_code)]
221 value: serde_json::Value,
222 /// The wrapped content.
223 content: Box<Self>,
224 },
225 /// Dynamic precedence wrapper.
226 #[serde(rename = "PREC_DYNAMIC")]
227 PrecDynamic {
228 /// Precedence value. Ignored at emit time.
229 #[allow(dead_code)]
230 value: serde_json::Value,
231 /// The wrapped content.
232 content: Box<Self>,
233 },
234 /// Reserved-word wrapper (tree-sitter ≥ 0.25).
235 ///
236 /// Tree-sitter's `RESERVED` rule marks an inner production as a
237 /// reserved-word context: the parser excludes the listed identifiers
238 /// from being treated as the inner symbol. The `context_name`
239 /// metadata names the reserved-word set; the emitter does not need
240 /// it (we are walking schema → bytes, not enforcing reserved-word
241 /// constraints), so we emit the inner content unchanged, the same
242 /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
243 #[serde(rename = "RESERVED")]
244 Reserved {
245 /// The wrapped content.
246 content: Box<Self>,
247 /// Name of the reserved-word context. Ignored at emit time.
248 #[allow(dead_code)]
249 #[serde(default)]
250 context_name: String,
251 },
252}
253
254/// A grammar's production-rule table, deserialized from `grammar.json`.
255///
256/// Only the fields the emitter consumes are decoded; precedences,
257/// conflicts, externals, and other parser-only metadata are ignored.
258#[derive(Debug, Clone, Deserialize)]
259pub struct Grammar {
260 /// Grammar name (e.g. `"rust"`, `"typescript"`).
261 #[allow(dead_code)]
262 pub name: String,
263 /// Map from rule name (a vertex kind on the schema side) to
264 /// production. Entries are kept in lexical order so iteration
265 /// is deterministic.
266 pub rules: BTreeMap<String, Production>,
267 /// Supertypes declared in the grammar's `supertypes` field. A
268 /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
269 /// references; tree-sitter parsers report a node's kind as one
270 /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
271 /// than the supertype name (`parameter`), so the emitter needs to
272 /// know that a child kind in a subtype set should match the
273 /// supertype name when a SYMBOL references it.
274 #[serde(default, deserialize_with = "deserialize_supertypes")]
275 pub supertypes: std::collections::HashSet<String>,
276 /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
277 /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
278 /// reference on the schema side.
279 ///
280 /// Built once at [`Grammar::from_bytes`] time by walking each
281 /// hidden rule (`_`-prefixed), declared supertype, and named
282 /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
283 /// recording the closure. This replaces the prior heuristic
284 /// `kind_satisfies_symbol` that walked the rule body on every
285 /// query: lookups are now O(1) and the relation is exactly the
286 /// transitive closure of "is reachable via hidden / supertype /
287 /// alias dispatch", with no over-expansion through non-hidden
288 /// non-supertype rule references.
289 #[serde(skip)]
290 pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
291}
292
293fn deserialize_supertypes<'de, D>(
294 deserializer: D,
295) -> Result<std::collections::HashSet<String>, D::Error>
296where
297 D: serde::Deserializer<'de>,
298{
299 let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
300 let mut out = std::collections::HashSet::new();
301 for entry in entries {
302 match entry {
303 serde_json::Value::String(s) => {
304 out.insert(s);
305 }
306 serde_json::Value::Object(map) => {
307 if let Some(serde_json::Value::String(name)) = map.get("name") {
308 out.insert(name.clone());
309 }
310 }
311 _ => {}
312 }
313 }
314 Ok(out)
315}
316
317impl Grammar {
318 /// Parse a grammar's `grammar.json` bytes.
319 ///
320 /// Builds the subtyping closure as part of construction so every
321 /// downstream lookup is O(1). The closure is the least relation
322 /// containing `(K, K)` for every rule key `K` and closed under:
323 ///
324 /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
325 /// reach SYMBOL `K`, then `K ⊑ S`.
326 /// - supertype expansion: if `S` is in the grammar's supertypes
327 /// block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
328 /// - alias renaming: if a rule body contains
329 /// `ALIAS { content: SYMBOL R, value: A, named: true }` where
330 /// `R` reaches kind `K` (or `K = R` when no further hop), then
331 /// `A ⊑ R` and `K ⊑ A`.
332 ///
333 /// # Errors
334 ///
335 /// Returns [`ParseError::EmitFailed`] when the bytes are not a
336 /// valid `grammar.json` document.
337 pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
338 let mut grammar: Self =
339 serde_json::from_slice(bytes).map_err(|e| ParseError::EmitFailed {
340 protocol: protocol.to_owned(),
341 reason: format!("grammar.json deserialization failed: {e}"),
342 })?;
343 grammar.subtypes = compute_subtype_closure(&grammar);
344 Ok(grammar)
345 }
346}
347
348/// Compute the subtyping relation as a forward-indexed map from a
349/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
350fn compute_subtype_closure(
351 grammar: &Grammar,
352) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
353 use std::collections::{HashMap, HashSet};
354 // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
355 // recorded whenever Y is reached by walking the grammar's
356 // ALIAS / hidden-rule / supertype dispatch from a position where
357 // K is the actual vertex kind.
358 let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
359 for name in grammar.rules.keys() {
360 subtypes
361 .entry(name.clone())
362 .or_default()
363 .insert(name.clone());
364 }
365
366 // First pass: collect the immediate "satisfies" edges from each
367 // expandable rule (hidden, supertype) to the kinds reachable by
368 // walking its body, plus alias edges.
369 fn walk<'g>(
370 grammar: &'g Grammar,
371 production: &'g Production,
372 visited: &mut HashSet<&'g str>,
373 out: &mut HashSet<String>,
374 ) {
375 match production {
376 Production::Symbol { name } => {
377 // Direct subtype.
378 out.insert(name.clone());
379 // Continue expansion through hidden / supertype rules
380 // so the closure traverses pass-through dispatch.
381 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
382 if expand && visited.insert(name.as_str()) {
383 if let Some(rule) = grammar.rules.get(name) {
384 walk(grammar, rule, visited, out);
385 }
386 }
387 }
388 Production::Choice { members } | Production::Seq { members } => {
389 for m in members {
390 walk(grammar, m, visited, out);
391 }
392 }
393 Production::Alias {
394 content,
395 named,
396 value,
397 } => {
398 if *named && !value.is_empty() {
399 out.insert(value.clone());
400 }
401 walk(grammar, content, visited, out);
402 }
403 Production::Repeat { content }
404 | Production::Repeat1 { content }
405 | Production::Optional { content }
406 | Production::Field { content, .. }
407 | Production::Token { content }
408 | Production::ImmediateToken { content }
409 | Production::Prec { content, .. }
410 | Production::PrecLeft { content, .. }
411 | Production::PrecRight { content, .. }
412 | Production::PrecDynamic { content, .. }
413 | Production::Reserved { content, .. } => {
414 walk(grammar, content, visited, out);
415 }
416 _ => {}
417 }
418 }
419
420 for (name, rule) in &grammar.rules {
421 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
422 if !expand {
423 continue;
424 }
425 let mut visited: HashSet<&str> = HashSet::new();
426 visited.insert(name.as_str());
427 let mut reachable: HashSet<String> = HashSet::new();
428 walk(grammar, rule, &mut visited, &mut reachable);
429 for kind in &reachable {
430 subtypes
431 .entry(kind.clone())
432 .or_default()
433 .insert(name.clone());
434 }
435 }
436
437 // Aliases: scan every rule body for ALIAS { content, value }
438 // declarations. The kinds reachable from `content` satisfy
439 // `value`, AND (by construction) `value` satisfies the
440 // surrounding rule. Walking the ENTIRE grammar once captures
441 // every alias site, irrespective of which rule introduces it.
442 fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
443 match production {
444 Production::Alias {
445 content,
446 named,
447 value,
448 } => {
449 if *named && !value.is_empty() {
450 out.push((value.clone(), content.as_ref()));
451 }
452 collect_aliases(content, out);
453 }
454 Production::Choice { members } | Production::Seq { members } => {
455 for m in members {
456 collect_aliases(m, out);
457 }
458 }
459 Production::Repeat { content }
460 | Production::Repeat1 { content }
461 | Production::Optional { content }
462 | Production::Field { content, .. }
463 | Production::Token { content }
464 | Production::ImmediateToken { content }
465 | Production::Prec { content, .. }
466 | Production::PrecLeft { content, .. }
467 | Production::PrecRight { content, .. }
468 | Production::PrecDynamic { content, .. }
469 | Production::Reserved { content, .. } => {
470 collect_aliases(content, out);
471 }
472 _ => {}
473 }
474 }
475 let mut aliases: Vec<(String, &Production)> = Vec::new();
476 for rule in grammar.rules.values() {
477 collect_aliases(rule, &mut aliases);
478 }
479 for (alias_value, content) in aliases {
480 let mut visited: HashSet<&str> = HashSet::new();
481 let mut reachable: HashSet<String> = HashSet::new();
482 walk(grammar, content, &mut visited, &mut reachable);
483 // Aliased value satisfies itself and is satisfied by every
484 // kind its content can reach.
485 subtypes
486 .entry(alias_value.clone())
487 .or_default()
488 .insert(alias_value.clone());
489 for kind in reachable {
490 subtypes
491 .entry(kind)
492 .or_default()
493 .insert(alias_value.clone());
494 }
495 }
496
497 // Transitive close: `K ⊑ A` and `A ⊑ B` implies `K ⊑ B`. Iterate
498 // a few rounds; the relation is small so a quick fixed-point
499 // suffices in practice.
500 for _ in 0..8 {
501 let snapshot = subtypes.clone();
502 let mut changed = false;
503 for (kind, supers) in &snapshot {
504 let extra: HashSet<String> = supers
505 .iter()
506 .flat_map(|s| snapshot.get(s).cloned().unwrap_or_default())
507 .collect();
508 let entry = subtypes.entry(kind.clone()).or_default();
509 for s in extra {
510 if entry.insert(s) {
511 changed = true;
512 }
513 }
514 }
515 if !changed {
516 break;
517 }
518 }
519
520 subtypes
521}
522
523// ═══════════════════════════════════════════════════════════════════
524// Format policy
525// ═══════════════════════════════════════════════════════════════════
526
527/// Whitespace and indentation policy applied during emission.
528///
529/// The default policy inserts a single space between adjacent tokens,
530/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
531/// boundaries. Per-language overrides (idiomatic indent width,
532/// trailing-comma rules, blank-line conventions) can ride alongside
533/// this struct in a follow-up branch; today's defaults aim only for
534/// syntactic validity.
535#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
536pub struct FormatPolicy {
537 /// Number of spaces per indent level.
538 pub indent_width: usize,
539 /// Separator inserted between adjacent terminals that the lexer
540 /// would otherwise glue together (word ↔ word, operator ↔ operator).
541 /// Default is a single space.
542 pub separator: String,
543 /// Newline byte sequence emitted after `line_break_after` tokens
544 /// and at end-of-output. Default is `"\n"`.
545 pub newline: String,
546 /// Tokens after which the walker breaks to a new line.
547 pub line_break_after: Vec<String>,
548 /// Tokens that increase indent on emission.
549 pub indent_open: Vec<String>,
550 /// Tokens that decrease indent on emission.
551 pub indent_close: Vec<String>,
552}
553
554impl Default for FormatPolicy {
555 fn default() -> Self {
556 Self {
557 indent_width: 2,
558 separator: " ".to_owned(),
559 newline: "\n".to_owned(),
560 line_break_after: vec![";".into(), "{".into(), "}".into()],
561 indent_open: vec!["{".into()],
562 indent_close: vec!["}".into()],
563 }
564 }
565}
566
567// ═══════════════════════════════════════════════════════════════════
568// Emitter
569// ═══════════════════════════════════════════════════════════════════
570
571/// Emit a by-construction schema to source bytes.
572///
573/// `protocol` is the grammar / language name (used in error messages
574/// and to label the entry point).
575///
576/// The walker treats `schema.entries` as the ordered list of root
577/// vertices, falling back to a deterministic by-id ordering when
578/// `entries` is empty. Each root is emitted using the production
579/// associated with its kind in `grammar.rules`.
580///
581/// # Errors
582///
583/// Returns [`ParseError::EmitFailed`] when:
584///
585/// - the schema has no vertices
586/// - a root vertex's kind is not a grammar rule
587/// - a `SYMBOL` reference points at a kind with no rule and no schema
588/// child to resolve it to
589/// - a required `FIELD` has no corresponding edge in the schema
590pub fn emit_pretty(
591 protocol: &str,
592 schema: &Schema,
593 grammar: &Grammar,
594 policy: &FormatPolicy,
595) -> Result<Vec<u8>, ParseError> {
596 let roots = collect_roots(schema);
597 if roots.is_empty() {
598 return Err(ParseError::EmitFailed {
599 protocol: protocol.to_owned(),
600 reason: "schema has no entry vertices".to_owned(),
601 });
602 }
603
604 let mut out = Output::new(policy);
605 for (i, root) in roots.iter().enumerate() {
606 if i > 0 {
607 out.newline();
608 }
609 emit_vertex(protocol, schema, grammar, root, &mut out)?;
610 }
611 Ok(out.finish())
612}
613
614fn collect_roots(schema: &Schema) -> Vec<&panproto_gat::Name> {
615 if !schema.entries.is_empty() {
616 return schema
617 .entries
618 .iter()
619 .filter(|name| schema.vertices.contains_key(*name))
620 .collect();
621 }
622
623 // Fallback: every vertex that is not the target of any structural edge
624 // (sorted by id for determinism).
625 let mut targets: std::collections::HashSet<&panproto_gat::Name> =
626 std::collections::HashSet::new();
627 for edge in schema.edges.keys() {
628 targets.insert(&edge.tgt);
629 }
630 let mut roots: Vec<&panproto_gat::Name> = schema
631 .vertices
632 .keys()
633 .filter(|name| !targets.contains(name))
634 .collect();
635 roots.sort();
636 roots
637}
638
639fn emit_vertex(
640 protocol: &str,
641 schema: &Schema,
642 grammar: &Grammar,
643 vertex_id: &panproto_gat::Name,
644 out: &mut Output<'_>,
645) -> Result<(), ParseError> {
646 let vertex = schema
647 .vertices
648 .get(vertex_id)
649 .ok_or_else(|| ParseError::EmitFailed {
650 protocol: protocol.to_owned(),
651 reason: format!("vertex '{vertex_id}' not found"),
652 })?;
653
654 // Leaf shortcut: a vertex carrying a `literal-value` constraint
655 // and no outgoing structural edges is a terminal token. Emit the
656 // captured value directly. This handles identifiers, numeric
657 // literals, and string literals that the parser stored as
658 // `literal-value` even on by-construction schemas.
659 if let Some(literal) = literal_value(schema, vertex_id) {
660 if children_for(schema, vertex_id).is_empty() {
661 out.token(literal);
662 return Ok(());
663 }
664 }
665
666 let kind = vertex.kind.as_ref();
667 let edges = children_for(schema, vertex_id);
668 if let Some(rule) = grammar.rules.get(kind) {
669 let mut cursor = ChildCursor::new(&edges);
670 return emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out);
671 }
672
673 // No rule for this kind. The parser produced it via an ALIAS
674 // (tree-sitter's `alias($.something, $.actual_kind)`) or via an
675 // external scanner (e.g. YAML's `document` root). Fall back to
676 // walking the children directly so the inner content survives;
677 // surrounding tokens — whose only source is the missing rule —
678 // are necessarily absent.
679 for edge in &edges {
680 emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
681 }
682 Ok(())
683}
684
685/// Linear cursor over a vertex's outgoing edges, used to thread
686/// children through a production rule without double-consuming them.
687struct ChildCursor<'a> {
688 edges: &'a [&'a Edge],
689 consumed: Vec<bool>,
690}
691
692impl<'a> ChildCursor<'a> {
693 fn new(edges: &'a [&'a Edge]) -> Self {
694 Self {
695 edges,
696 consumed: vec![false; edges.len()],
697 }
698 }
699
700 /// Take the next unconsumed edge whose kind equals `field_name`.
701 fn take_field(&mut self, field_name: &str) -> Option<&'a Edge> {
702 for (i, edge) in self.edges.iter().enumerate() {
703 if !self.consumed[i] && edge.kind.as_ref() == field_name {
704 self.consumed[i] = true;
705 return Some(edge);
706 }
707 }
708 None
709 }
710
711 /// Whether any unconsumed edge satisfies `predicate`. Used by the
712 /// unit tests; the live emit path went through `has_matching` on
713 /// each alternative until cursor-driven dispatch was rewritten to
714 /// pick the first-unconsumed-edge's kind directly.
715 #[cfg(test)]
716 fn has_matching(&self, predicate: impl Fn(&Edge) -> bool) -> bool {
717 self.edges
718 .iter()
719 .enumerate()
720 .any(|(i, edge)| !self.consumed[i] && predicate(edge))
721 }
722
723 /// Take the next unconsumed edge whose target vertex satisfies
724 /// `predicate`. Returns the edge and the underlying production
725 /// resolution path is the caller's job.
726 fn take_matching(&mut self, predicate: impl Fn(&Edge) -> bool) -> Option<&'a Edge> {
727 for (i, edge) in self.edges.iter().enumerate() {
728 if !self.consumed[i] && predicate(edge) {
729 self.consumed[i] = true;
730 return Some(edge);
731 }
732 }
733 None
734 }
735}
736
737thread_local! {
738 static EMIT_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
739 /// Set of `(vertex_id, rule_name)` pairs that are currently being
740 /// walked by the recursion. A SYMBOL that resolves to a rule
741 /// already on this stack closes a μ-binder cycle: in the
742 /// coinductive reading, the rule walk at any vertex is the least
743 /// fixed point of `body[μ X . body / X]`, which unfolds at most
744 /// once, with the second visit returning the empty sequence (the
745 /// unit of the free token monoid). Examples that trigger this:
746 /// YAML's `stream` ⊃ `_b_blk_*` mutually-recursive chain, Rust's
747 /// `_expression` ⊃ `binary_expression` ⊃ `_expression`.
748 static EMIT_MU_FRAMES: std::cell::RefCell<std::collections::HashSet<(String, String)>> =
749 std::cell::RefCell::new(std::collections::HashSet::new());
750 /// The name of the FIELD whose body the walker is currently inside,
751 /// or `None` at top level. Lets a SYMBOL nested arbitrarily deep
752 /// in the field's content (under SEQ, CHOICE, REPEAT, OPTIONAL)
753 /// consume from the *outer* cursor by edge-kind rather than from
754 /// the child's own cursor by symbol-match. Without this, shapes
755 /// like `field('args', commaSep1($.X))` — which expands to
756 /// `FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X))))` — emit only
757 /// the first matched edge: the FIELD handler consumed one edge,
758 /// the inner REPEAT searched the consumed child's cursor (which
759 /// has no more sibling field edges), and the REPEAT broke after
760 /// one iteration. Setting the context here so the inner SYMBOL
761 /// pulls successive field-named edges from the outer cursor
762 /// recovers every matched edge across arbitrary nesting.
763 static EMIT_FIELD_CONTEXT: std::cell::RefCell<Option<String>> =
764 const { std::cell::RefCell::new(None) };
765}
766
767/// RAII guard that restores the prior `EMIT_FIELD_CONTEXT` value on drop.
768struct FieldContextGuard(Option<String>);
769
770impl Drop for FieldContextGuard {
771 fn drop(&mut self) {
772 EMIT_FIELD_CONTEXT.with(|f| *f.borrow_mut() = self.0.take());
773 }
774}
775
776fn push_field_context(name: &str) -> FieldContextGuard {
777 let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().replace(name.to_owned()));
778 FieldContextGuard(prev)
779}
780
781/// Clear the field context for the duration of a child-context walk.
782/// The child's own production has its own FIELDs that set their own
783/// context; the outer field hint must not leak into them.
784fn clear_field_context() -> FieldContextGuard {
785 let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().take());
786 FieldContextGuard(prev)
787}
788
789fn current_field_context() -> Option<String> {
790 EMIT_FIELD_CONTEXT.with(|f| f.borrow().clone())
791}
792
793/// Walk a rule at a vertex inside a μ-binder. The wrapping frame is
794/// pushed before recursion and popped after, so any SYMBOL inside
795/// `rule` that re-enters the same `(vertex_id, rule_name)` pair
796/// returns the empty sequence (μ X . body unfolds once).
797fn walk_in_mu_frame(
798 protocol: &str,
799 schema: &Schema,
800 grammar: &Grammar,
801 vertex_id: &panproto_gat::Name,
802 rule_name: &str,
803 rule: &Production,
804 cursor: &mut ChildCursor<'_>,
805 out: &mut Output<'_>,
806) -> Result<(), ParseError> {
807 let key = (vertex_id.to_string(), rule_name.to_owned());
808 let inserted = EMIT_MU_FRAMES.with(|frames| frames.borrow_mut().insert(key.clone()));
809 if !inserted {
810 // We are already walking this rule at this vertex deeper in
811 // the call stack. The coinductive μ-fixed-point reading
812 // returns the empty sequence here; the surrounding
813 // production resumes after the SYMBOL.
814 return Ok(());
815 }
816 let result = emit_production(protocol, schema, grammar, vertex_id, rule, cursor, out);
817 EMIT_MU_FRAMES.with(|frames| {
818 frames.borrow_mut().remove(&key);
819 });
820 result
821}
822
823fn emit_production(
824 protocol: &str,
825 schema: &Schema,
826 grammar: &Grammar,
827 vertex_id: &panproto_gat::Name,
828 production: &Production,
829 cursor: &mut ChildCursor<'_>,
830 out: &mut Output<'_>,
831) -> Result<(), ParseError> {
832 let depth = EMIT_DEPTH.with(|d| {
833 let v = d.get() + 1;
834 d.set(v);
835 v
836 });
837 if depth > 500 {
838 EMIT_DEPTH.with(|d| d.set(d.get() - 1));
839 return Err(ParseError::EmitFailed {
840 protocol: protocol.to_owned(),
841 reason: format!(
842 "emit_production recursion >500 (likely a cyclic grammar; \
843 vertex='{vertex_id}')"
844 ),
845 });
846 }
847 let result = emit_production_inner(
848 protocol, schema, grammar, vertex_id, production, cursor, out,
849 );
850 EMIT_DEPTH.with(|d| d.set(d.get() - 1));
851 result
852}
853
854fn emit_production_inner(
855 protocol: &str,
856 schema: &Schema,
857 grammar: &Grammar,
858 vertex_id: &panproto_gat::Name,
859 production: &Production,
860 cursor: &mut ChildCursor<'_>,
861 out: &mut Output<'_>,
862) -> Result<(), ParseError> {
863 match production {
864 Production::String { value } => {
865 out.token(value);
866 Ok(())
867 }
868 Production::Pattern { value } => {
869 if let Some(literal) = literal_value(schema, vertex_id) {
870 out.token(literal);
871 } else {
872 out.token(&placeholder_for_pattern(value));
873 }
874 Ok(())
875 }
876 Production::Blank => Ok(()),
877 Production::Symbol { name } => {
878 // Inside a FIELD body, a SYMBOL consumes by field-name on
879 // the outer cursor rather than searching by symbol-match.
880 // This covers the simple `FIELD(SYMBOL X)` case as well as
881 // every nesting under FIELD that contains SYMBOLs (SEQ,
882 // CHOICE, REPEAT, OPTIONAL, ALIAS). Without the override,
883 // shapes like `field('args', commaSep1($.X))` consume one
884 // field edge in the FIELD handler and then the REPEAT
885 // inside SEQ searches the consumed child's cursor — where
886 // no sibling field edges sit — and breaks after one
887 // iteration.
888 if let Some(field) = current_field_context() {
889 if let Some(edge) = cursor.take_field(&field) {
890 return emit_in_child_context(
891 protocol, schema, grammar, &edge.tgt, production, out,
892 );
893 }
894 // No matching field-named edge left on the outer
895 // cursor. Surface nothing; the surrounding REPEAT /
896 // OPTIONAL / CHOICE backtracks the literal tokens it
897 // emitted on this iteration when it sees no progress.
898 return Ok(());
899 }
900 if name.starts_with('_') {
901 // Hidden rule: not a vertex kind on the schema side.
902 // Inline-expand the rule body so its children take
903 // edges from the current cursor, instead of trying to
904 // take a single child edge that "satisfies" the
905 // hidden rule and discarding the rest of the body
906 // (which would drop tokens like `=` and the trailing
907 // value SYMBOL inside e.g. TOML's `_inline_pair`).
908 //
909 // Wrapped in a μ-frame so a hidden rule that
910 // references its own kind cyclically (or another
911 // hidden rule that closes the cycle) unfolds once
912 // and then collapses to the empty sequence at the
913 // second visit, rather than blowing the stack.
914 if let Some(rule) = grammar.rules.get(name) {
915 walk_in_mu_frame(
916 protocol, schema, grammar, vertex_id, name, rule, cursor, out,
917 )
918 } else {
919 // External hidden rule (declared in the
920 // grammar's `externals` block, scanned by C code,
921 // not listed in `rules`). Heuristic fallback by
922 // name:
923 //
924 // - `_indent` / `*_indent`: open an indent block.
925 // Indent-based grammars (Python, YAML, qvr)
926 // declare an `_indent` external scanner before
927 // the body of a block-bodied declaration; the
928 // emitted output is unparseable without the
929 // corresponding indentation jump.
930 // - `_dedent` / `*_dedent`: close the matching
931 // indent block.
932 // - `_newline` / `*_line_ending` / `*_or_eof`:
933 // universally newline-or-empty; emitting a
934 // single newline is the right default for
935 // grammars like TOML whose `pair` SEQ trails
936 // into `_line_ending_or_eof`.
937 //
938 // Anything else falls through silently — better
939 // to drop an unknown external token than to
940 // invent one that confuses re-parsing.
941 if name == "_indent" || name.ends_with("_indent") {
942 out.indent_open();
943 } else if name == "_dedent" || name.ends_with("_dedent") {
944 out.indent_close();
945 } else if name.contains("line_ending")
946 || name.contains("newline")
947 || name.ends_with("_or_eof")
948 {
949 out.newline();
950 }
951 Ok(())
952 }
953 } else if let Some(edge) = take_symbol_match(grammar, schema, cursor, name) {
954 // For supertype / hidden-rule dispatch the child's
955 // own kind names the actual production to walk
956 // (`child.kind` IS the subtype). For ALIAS the
957 // dependent-optic context is carried by the
958 // surrounding `Production::Alias` branch, which calls
959 // `emit_aliased_child` directly; we don't reach here
960 // for that case. So walking `grammar.rules[child.kind]`
961 // via `emit_vertex` is correct: the dependent-optic
962 // path is preserved at every site where it actually
963 // diverges from `child.kind`.
964 emit_vertex(protocol, schema, grammar, &edge.tgt, out)
965 } else if vertex_id_kind(schema, vertex_id) == Some(name.as_str()) {
966 let rule = grammar
967 .rules
968 .get(name)
969 .ok_or_else(|| ParseError::EmitFailed {
970 protocol: protocol.to_owned(),
971 reason: format!("no production for SYMBOL '{name}'"),
972 })?;
973 // Self-reference (`X = ... SYMBOL X ...`): wrap in a
974 // μ-frame so re-entry collapses to the empty sequence.
975 walk_in_mu_frame(
976 protocol, schema, grammar, vertex_id, name, rule, cursor, out,
977 )
978 } else {
979 // Named rule with no matching child: emit nothing and
980 // let the surrounding CHOICE / OPTIONAL / REPEAT
981 // resolve the absence.
982 Ok(())
983 }
984 }
985 Production::Seq { members } => {
986 for member in members {
987 emit_production(protocol, schema, grammar, vertex_id, member, cursor, out)?;
988 }
989 Ok(())
990 }
991 Production::Choice { members } => {
992 if let Some(matched) =
993 pick_choice_with_cursor(schema, grammar, vertex_id, cursor, members)
994 {
995 emit_production(protocol, schema, grammar, vertex_id, matched, cursor, out)
996 } else {
997 Ok(())
998 }
999 }
1000 Production::Repeat { content } | Production::Repeat1 { content } => {
1001 let mut emitted_any = false;
1002 loop {
1003 let cursor_snap = cursor.consumed.clone();
1004 let out_snap = out.snapshot();
1005 let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1006 let result =
1007 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
1008 let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1009 if result.is_err() || consumed_after == consumed_before {
1010 cursor.consumed = cursor_snap;
1011 out.restore(out_snap);
1012 break;
1013 }
1014 emitted_any = true;
1015 }
1016 if matches!(production, Production::Repeat1 { .. }) && !emitted_any {
1017 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)?;
1018 }
1019 Ok(())
1020 }
1021 Production::Optional { content } => {
1022 let cursor_snap = cursor.consumed.clone();
1023 let out_snap = out.snapshot();
1024 let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1025 let result =
1026 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
1027 // OPTIONAL is a backtracking site: if the inner production
1028 // errored *or* made no progress without leaving a witness
1029 // constraint, restore both cursor and output to their
1030 // pre-attempt state. Mirrors `Repeat`'s loop body.
1031 if result.is_err() {
1032 cursor.consumed = cursor_snap;
1033 out.restore(out_snap);
1034 return result;
1035 }
1036 let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1037 if consumed_after == consumed_before
1038 && !has_relevant_constraint(content, schema, vertex_id)
1039 {
1040 cursor.consumed = cursor_snap;
1041 out.restore(out_snap);
1042 }
1043 Ok(())
1044 }
1045 Production::Field { name, content } => {
1046 // Set the field context for the duration of `content`'s
1047 // walk and emit the content against the *outer* cursor.
1048 // The SYMBOL handler picks up the context and pulls
1049 // successive `take_field(name)` edges as it encounters
1050 // SYMBOLs anywhere under `content` (under SEQ, CHOICE,
1051 // REPEAT, OPTIONAL, ALIAS — arbitrarily nested). This
1052 // subsumes the prior carve-outs for FIELD(REPEAT(...)),
1053 // FIELD(REPEAT1(...)), and the bare FIELD(SYMBOL ...)
1054 // case, and adds coverage for
1055 // `field('xs', commaSep1($.X))` which expands to
1056 // FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X)))) and
1057 // any other shape where REPEAT/REPEAT1 sits inside SEQ /
1058 // CHOICE / OPTIONAL under a FIELD. A FIELD that wraps a
1059 // non-SYMBOL production (e.g. `field('op', '+')` or
1060 // `field('op', CHOICE(STRING ...))`) still works: STRING
1061 // handlers ignore the context and emit literals
1062 // directly, so the operator token survives the round
1063 // trip.
1064 let _guard = push_field_context(name);
1065 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1066 }
1067 Production::Alias {
1068 content,
1069 named,
1070 value,
1071 } => {
1072 // A named ALIAS rewrites the parser-visible kind to
1073 // `value`. If the cursor has an unconsumed child whose
1074 // kind matches that alias name, take it and emit the
1075 // child using the alias's INNER content as the rule
1076 // (e.g. `ALIAS { SYMBOL real_rule, value: "kind_x" }`
1077 // means a `kind_x` vertex on the schema should be walked
1078 // through `real_rule`'s body, not through whatever rule
1079 // happens to be keyed under `kind_x`). This is the
1080 // dependent-optic shape: the rule the emitter walks at a
1081 // child position is determined by the parent's chosen
1082 // alias, not by the child kind alone — without it,
1083 // grammars like YAML that introduce the same kind through
1084 // many ALIAS sites lose the parent context the moment
1085 // emit_vertex is called.
1086 if *named && !value.is_empty() {
1087 if let Some(edge) = cursor.take_matching(|edge| {
1088 schema
1089 .vertices
1090 .get(&edge.tgt)
1091 .map(|v| v.kind.as_ref() == value.as_str())
1092 .unwrap_or(false)
1093 }) {
1094 return emit_aliased_child(protocol, schema, grammar, &edge.tgt, content, out);
1095 }
1096 }
1097 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1098 }
1099 Production::Token { content }
1100 | Production::ImmediateToken { content }
1101 | Production::Prec { content, .. }
1102 | Production::PrecLeft { content, .. }
1103 | Production::PrecRight { content, .. }
1104 | Production::PrecDynamic { content, .. }
1105 | Production::Reserved { content, .. } => {
1106 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1107 }
1108 }
1109}
1110
1111/// Take the next cursor edge whose target vertex's kind matches the
1112/// SYMBOL `name` directly or via inline expansion of a hidden rule.
1113fn take_symbol_match<'a>(
1114 grammar: &Grammar,
1115 schema: &Schema,
1116 cursor: &mut ChildCursor<'a>,
1117 name: &str,
1118) -> Option<&'a Edge> {
1119 cursor.take_matching(|edge| {
1120 let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1121 kind_satisfies_symbol(grammar, target_kind, name)
1122 })
1123}
1124
1125/// Decide whether a schema vertex of kind `target_kind` satisfies a
1126/// SYMBOL `name` reference in the grammar.
1127///
1128/// Operates as an O(1) lookup against the precomputed subtype
1129/// closure built at [`Grammar::from_bytes`]. The semantic content is
1130/// "K satisfies SYMBOL S iff K is reachable from S by walking the
1131/// grammar's hidden, supertype, and named-alias dispatch": this is
1132/// exactly the relation tree-sitter induces on `(parser-visible kind,
1133/// rule-position)` pairs.
1134fn kind_satisfies_symbol(grammar: &Grammar, target_kind: Option<&str>, name: &str) -> bool {
1135 let Some(target) = target_kind else {
1136 return false;
1137 };
1138 if target == name {
1139 return true;
1140 }
1141 grammar
1142 .subtypes
1143 .get(target)
1144 .is_some_and(|set| set.contains(name))
1145}
1146
1147/// Emit a child reached through an ALIAS production using the
1148/// alias's inner content as the rule, not `grammar.rules[child.kind]`.
1149///
1150/// This carries the dependent-optic context across the ALIAS edge:
1151/// at the parent rule's site we know which underlying production the
1152/// alias wraps (typically `SYMBOL real_rule`), and that's the
1153/// production that should drive the emit walk on the child's
1154/// children. Looking up `grammar.rules.get(child.kind)` instead would
1155/// either fail (the renamed kind has no top-level rule, e.g. YAML's
1156/// `block_mapping_pair`) or pick an arbitrary same-kinded rule from
1157/// elsewhere in the grammar.
1158///
1159/// Walk-context invariant. The dependent-optic shape of `emit_pretty`
1160/// says: the production walked at any vertex is determined by the
1161/// path from the root through the grammar, not by the vertex kind in
1162/// isolation. Two dispatch sites realise that invariant:
1163///
1164/// * [`emit_vertex`] looks up `grammar.rules[child.kind]` and walks
1165/// it. Correct for supertype / hidden-rule dispatch: the child's
1166/// kind on the schema IS the subtype tree-sitter selected, so its
1167/// top-level rule is the right production to walk.
1168/// * `emit_aliased_child` threads the parent rule's `Production`
1169/// directly (the inner `content` of `Production::Alias`) and walks
1170/// it on the child's children. Correct for ALIAS dispatch: the
1171/// child's kind on the schema is the alias's `value` (a renamed
1172/// kind that may have no top-level rule), and the production to
1173/// walk is the alias's content body, supplied by the parent.
1174///
1175/// Together these cover every site where the rule-walked-at-child
1176/// diverges from `grammar.rules[child.kind]`; the recursion site for
1177/// plain SYMBOL therefore correctly delegates to `emit_vertex`, and
1178/// we do not need a richer `WalkContext` value passed by reference.
1179/// The grammar dependency is the thread.
1180fn emit_aliased_child(
1181 protocol: &str,
1182 schema: &Schema,
1183 grammar: &Grammar,
1184 child_id: &panproto_gat::Name,
1185 content: &Production,
1186 out: &mut Output<'_>,
1187) -> Result<(), ParseError> {
1188 // Leaf shortcut: if the child has a literal-value and no
1189 // structural children, emit the captured text. Identifiers and
1190 // similar terminals reach here when an ALIAS wraps a SYMBOL that
1191 // resolves to a PATTERN.
1192 if let Some(literal) = literal_value(schema, child_id) {
1193 if children_for(schema, child_id).is_empty() {
1194 out.token(literal);
1195 return Ok(());
1196 }
1197 }
1198
1199 // Resolve `content` to a rule when it's a SYMBOL (the dominant
1200 // shape: `ALIAS { content: SYMBOL real_rule, value: "kind_x" }`).
1201 if let Production::Symbol { name } = content {
1202 if let Some(rule) = grammar.rules.get(name) {
1203 let edges = children_for(schema, child_id);
1204 let mut cursor = ChildCursor::new(&edges);
1205 return emit_production(protocol, schema, grammar, child_id, rule, &mut cursor, out);
1206 }
1207 }
1208
1209 // Other ALIAS contents (CHOICE, SEQ, literals) walk in place.
1210 let edges = children_for(schema, child_id);
1211 let mut cursor = ChildCursor::new(&edges);
1212 emit_production(
1213 protocol,
1214 schema,
1215 grammar,
1216 child_id,
1217 content,
1218 &mut cursor,
1219 out,
1220 )
1221}
1222
1223fn emit_in_child_context(
1224 protocol: &str,
1225 schema: &Schema,
1226 grammar: &Grammar,
1227 child_id: &panproto_gat::Name,
1228 production: &Production,
1229 out: &mut Output<'_>,
1230) -> Result<(), ParseError> {
1231 // The child walks under its own production tree, with its own
1232 // FIELDs setting their own contexts. Clear the outer FIELD hint
1233 // so it does not leak through and cause sibling SYMBOLs inside
1234 // the child's body to mistakenly pull edges from the child's
1235 // cursor by the parent's field name.
1236 let _guard = clear_field_context();
1237 // If `production` is a structural wrapper (CHOICE / SEQ /
1238 // OPTIONAL / ...) whose referenced symbols cover the child's own
1239 // kind, the child IS the production's target node and the right
1240 // emit path is `emit_vertex(child)` (which honours the
1241 // literal-value leaf shortcut). Without this guard, FIELD(pattern,
1242 // CHOICE { _pattern, self }) on an identifier child walks the
1243 // CHOICE on the identifier's empty cursor, falls through to the
1244 // first non-BLANK alt, and loses the captured identifier text.
1245 if !matches!(production, Production::Symbol { .. }) {
1246 let child_kind = schema.vertices.get(child_id).map(|v| v.kind.as_ref());
1247 let symbols = referenced_symbols(production);
1248 if symbols
1249 .iter()
1250 .any(|s| kind_satisfies_symbol(grammar, child_kind, s) || child_kind == Some(s))
1251 {
1252 return emit_vertex(protocol, schema, grammar, child_id, out);
1253 }
1254 }
1255 match production {
1256 Production::Symbol { .. } => emit_vertex(protocol, schema, grammar, child_id, out),
1257 _ => {
1258 let edges = children_for(schema, child_id);
1259 let mut cursor = ChildCursor::new(&edges);
1260 emit_production(
1261 protocol,
1262 schema,
1263 grammar,
1264 child_id,
1265 production,
1266 &mut cursor,
1267 out,
1268 )
1269 }
1270 }
1271}
1272
1273fn pick_choice_with_cursor<'a>(
1274 schema: &Schema,
1275 grammar: &Grammar,
1276 vertex_id: &panproto_gat::Name,
1277 cursor: &ChildCursor<'_>,
1278 alternatives: &'a [Production],
1279) -> Option<&'a Production> {
1280 // Discriminator-driven dispatch (highest priority): when the
1281 // walker recorded a `chose-alt-fingerprint` constraint at parse
1282 // time, dispatch directly against that. This is the categorical
1283 // discriminator: it survives stripping of byte-position
1284 // constraints (so by-construction round-trips work) and is the
1285 // explicit witness of which CHOICE alternative the parser took.
1286 //
1287 // Falls back to the live `interstitial-*` substring blob when no
1288 // fingerprint is present (e.g. instances built by callers that
1289 // bypass the AstWalker). Both blobs are scored by the longest
1290 // STRING-literal token in an alternative that matches; the
1291 // length tiebreak prefers `&&` over `&`, `==` over `=`, etc.
1292 let constraint_blob = schema
1293 .constraints
1294 .get(vertex_id)
1295 .map(|cs| {
1296 let fingerprint: Option<&str> = cs
1297 .iter()
1298 .find(|c| c.sort.as_ref() == "chose-alt-fingerprint")
1299 .map(|c| c.value.as_str());
1300 if let Some(fp) = fingerprint {
1301 fp.to_owned()
1302 } else {
1303 cs.iter()
1304 .filter(|c| {
1305 let s = c.sort.as_ref();
1306 s.starts_with("interstitial-") && !s.ends_with("-start-byte")
1307 })
1308 .map(|c| c.value.as_str())
1309 .collect::<Vec<&str>>()
1310 .join(" ")
1311 }
1312 })
1313 .unwrap_or_default();
1314 let child_kinds: Vec<&str> = schema
1315 .constraints
1316 .get(vertex_id)
1317 .and_then(|cs| {
1318 cs.iter()
1319 .find(|c| c.sort.as_ref() == "chose-alt-child-kinds")
1320 .map(|c| c.value.split_whitespace().collect())
1321 })
1322 .unwrap_or_default();
1323 if !constraint_blob.is_empty() {
1324 // Primary score: literal-token match length. This dominates
1325 // alt selection so existing language tests that depend on
1326 // literal-only fingerprints keep working.
1327 // Secondary score (tiebreaker only): named-symbol kind match
1328 // count, read from the separate `chose-alt-child-kinds`
1329 // constraint (kept apart from the literal fingerprint so
1330 // identifiers like `:` in the kind list don't contaminate the
1331 // literal match). An alt that matches the recorded kinds is a
1332 // stronger witness than one whose only
1333 // overlap is literal punctuation.
1334 let mut best_literal: usize = 0;
1335 let mut best_symbols: usize = 0;
1336 let mut best_alt: Option<&Production> = None;
1337 let mut tied = false;
1338 for alt in alternatives {
1339 let strings = literal_strings(alt);
1340 if strings.is_empty() {
1341 continue;
1342 }
1343 let literal_score = strings
1344 .iter()
1345 .filter(|s| constraint_blob.contains(s.as_str()))
1346 .map(String::len)
1347 .sum::<usize>();
1348 if literal_score == 0 {
1349 continue;
1350 }
1351 // Symbol score is computed only as a tiebreaker among alts
1352 // whose literal-token coverage is the same; it never lifts
1353 // an alt above one with a strictly higher literal score.
1354 // Reads the `chose-alt-child-kinds` constraint (a separate
1355 // sequence the walker emits, kept apart from the literal
1356 // fingerprint to avoid cross-contamination).
1357 let symbol_score = if literal_score >= best_literal && !child_kinds.is_empty() {
1358 let symbols = referenced_symbols(alt);
1359 symbols
1360 .iter()
1361 .filter(|sym| {
1362 let sym_str: &str = sym;
1363 if child_kinds.contains(&sym_str) {
1364 return true;
1365 }
1366 grammar.subtypes.get(sym_str).is_some_and(|sub_set| {
1367 sub_set
1368 .iter()
1369 .any(|sub| child_kinds.contains(&sub.as_str()))
1370 })
1371 })
1372 .count()
1373 } else {
1374 0
1375 };
1376 let better = literal_score > best_literal
1377 || (literal_score == best_literal && symbol_score > best_symbols);
1378 let same = literal_score == best_literal && symbol_score == best_symbols;
1379 if better {
1380 best_literal = literal_score;
1381 best_symbols = symbol_score;
1382 best_alt = Some(alt);
1383 tied = false;
1384 } else if same && best_alt.is_some() {
1385 tied = true;
1386 }
1387 }
1388 // Only commit to an alt when the fingerprint discriminates it
1389 // uniquely. A tie means the alts share the same literal token
1390 // set (e.g. JSON's `string = CHOICE { SEQ { '"', '"' }, SEQ {
1391 // '"', _string_content, '"' } }` — both alts contain just the
1392 // two `"` tokens). In that case fall through to cursor-based
1393 // dispatch, which uses the actual edge structure.
1394 if let Some(alt) = best_alt {
1395 if !tied {
1396 return Some(alt);
1397 }
1398 }
1399 }
1400
1401 // Cursor-driven dispatch: pick the alternative whose body
1402 // references a SYMBOL covering the *first unconsumed* edge in
1403 // cursor order. `referenced_symbols` walks the alternative
1404 // recursively (across nested SEQs, REPEATs, OPTIONALs, FIELDs,
1405 // etc.) so a leading optional like `attribute_item` does not
1406 // block matching when only the trailing required symbol is
1407 // present on the schema.
1408 //
1409 // Ordering by the first unconsumed edge (rather than picking any
1410 // alternative whose SYMBOL set intersects the unconsumed
1411 // multiset) is what preserves schema edge order under
1412 // REPEAT(CHOICE(...)) productions. Without this rule, alt order
1413 // in the grammar's CHOICE determines the emission order, and a
1414 // schema with interleaved kinds like `[symbol, punct, int,
1415 // symbol, punct, int]` re-fuses to `[symbol, symbol, punct,
1416 // punct, int, int]` when emitted then re-parsed. The fix is the
1417 // categorical reading of REPEAT-over-list (list-shaped fold)
1418 // rather than REPEAT-over-multiset (unordered fold).
1419 let first_unconsumed_kind: Option<&str> = cursor
1420 .edges
1421 .iter()
1422 .enumerate()
1423 .find(|(i, _)| !cursor.consumed[*i])
1424 .and_then(|(_, edge)| schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref()));
1425 if let Some(target_kind) = first_unconsumed_kind {
1426 for alt in alternatives {
1427 let symbols = referenced_symbols(alt);
1428 if !symbols.is_empty()
1429 && symbols
1430 .iter()
1431 .any(|s| kind_satisfies_symbol(grammar, Some(target_kind), s))
1432 {
1433 return Some(alt);
1434 }
1435 }
1436 }
1437
1438 // FIELD dispatch: pick an alternative whose FIELD name matches an
1439 // unconsumed edge kind.
1440 let edge_kinds: Vec<&str> = cursor
1441 .edges
1442 .iter()
1443 .enumerate()
1444 .filter(|(i, _)| !cursor.consumed[*i])
1445 .map(|(_, e)| e.kind.as_ref())
1446 .collect();
1447 for alt in alternatives {
1448 if has_field_in(alt, &edge_kinds) {
1449 return Some(alt);
1450 }
1451 }
1452
1453 // No cursor-driven match. Fall back to:
1454 //
1455 // - BLANK (the explicit empty alternative) when present, so an
1456 // OPTIONAL-shaped CHOICE compiles to nothing.
1457 // - The first non-`BLANK` alternative as a last resort, used by
1458 // STRING-only alternatives (keyword tokens) and other choices
1459 // that don't reach the cursor.
1460 //
1461 // The previous "match own_kind" branch is intentionally absent:
1462 // when an alt's first SYMBOL equals the current vertex's kind, the
1463 // caller is already emitting that vertex's own rule. Recursing
1464 // into the alt would cause a self-loop in the rule walk.
1465 let _ = (schema, vertex_id);
1466 if alternatives.iter().any(|a| matches!(a, Production::Blank)) {
1467 return alternatives.iter().find(|a| matches!(a, Production::Blank));
1468 }
1469 alternatives
1470 .iter()
1471 .find(|alt| !matches!(alt, Production::Blank))
1472}
1473
1474/// Collect every literal STRING token directly inside `production`
1475/// (without descending into SYMBOLs / hidden rules). Used to score
1476/// CHOICE alternatives against the parent vertex's interstitials so
1477/// the right operator / keyword form is picked when the schema
1478/// preserves interstitial fragments from a prior parse.
1479fn literal_strings(production: &Production) -> Vec<String> {
1480 let mut out = Vec::new();
1481 fn walk(p: &Production, out: &mut Vec<String>) {
1482 match p {
1483 Production::String { value } if !value.is_empty() => {
1484 out.push(value.clone());
1485 }
1486 Production::Choice { members } | Production::Seq { members } => {
1487 for m in members {
1488 walk(m, out);
1489 }
1490 }
1491 Production::Repeat { content }
1492 | Production::Repeat1 { content }
1493 | Production::Optional { content }
1494 | Production::Field { content, .. }
1495 | Production::Alias { content, .. }
1496 | Production::Token { content }
1497 | Production::ImmediateToken { content }
1498 | Production::Prec { content, .. }
1499 | Production::PrecLeft { content, .. }
1500 | Production::PrecRight { content, .. }
1501 | Production::PrecDynamic { content, .. }
1502 | Production::Reserved { content, .. } => walk(content, out),
1503 _ => {}
1504 }
1505 }
1506 walk(production, &mut out);
1507 out
1508}
1509
1510/// Collect every SYMBOL name reachable from `production` without
1511/// crossing into nested rules. Used by `pick_choice_with_cursor` to
1512/// rank alternatives by "any SYMBOL inside this alt matches something
1513/// on the cursor", instead of just the first SYMBOL: a leading
1514/// optional like `attribute_item` then `parameter` is otherwise
1515/// rejected when only the parameter children are present.
1516fn referenced_symbols(production: &Production) -> Vec<&str> {
1517 let mut out = Vec::new();
1518 fn walk<'a>(p: &'a Production, out: &mut Vec<&'a str>) {
1519 match p {
1520 Production::Symbol { name } => out.push(name.as_str()),
1521 Production::Choice { members } | Production::Seq { members } => {
1522 for m in members {
1523 walk(m, out);
1524 }
1525 }
1526 Production::Repeat { content }
1527 | Production::Repeat1 { content }
1528 | Production::Optional { content }
1529 | Production::Field { content, .. }
1530 | Production::Alias { content, .. }
1531 | Production::Token { content }
1532 | Production::ImmediateToken { content }
1533 | Production::Prec { content, .. }
1534 | Production::PrecLeft { content, .. }
1535 | Production::PrecRight { content, .. }
1536 | Production::PrecDynamic { content, .. }
1537 | Production::Reserved { content, .. } => walk(content, out),
1538 _ => {}
1539 }
1540 }
1541 walk(production, &mut out);
1542 out
1543}
1544
1545#[cfg(test)]
1546fn first_symbol(production: &Production) -> Option<&str> {
1547 match production {
1548 Production::Symbol { name } => Some(name),
1549 Production::Seq { members } => members.iter().find_map(first_symbol),
1550 Production::Choice { members } => members.iter().find_map(first_symbol),
1551 Production::Repeat { content }
1552 | Production::Repeat1 { content }
1553 | Production::Optional { content }
1554 | Production::Field { content, .. }
1555 | Production::Alias { content, .. }
1556 | Production::Token { content }
1557 | Production::ImmediateToken { content }
1558 | Production::Prec { content, .. }
1559 | Production::PrecLeft { content, .. }
1560 | Production::PrecRight { content, .. }
1561 | Production::PrecDynamic { content, .. }
1562 | Production::Reserved { content, .. } => first_symbol(content),
1563 _ => None,
1564 }
1565}
1566
1567fn has_field_in(production: &Production, edge_kinds: &[&str]) -> bool {
1568 match production {
1569 Production::Field { name, .. } => edge_kinds.contains(&name.as_str()),
1570 Production::Seq { members } | Production::Choice { members } => {
1571 members.iter().any(|m| has_field_in(m, edge_kinds))
1572 }
1573 Production::Repeat { content }
1574 | Production::Repeat1 { content }
1575 | Production::Optional { content }
1576 | Production::Alias { content, .. }
1577 | Production::Token { content }
1578 | Production::ImmediateToken { content }
1579 | Production::Prec { content, .. }
1580 | Production::PrecLeft { content, .. }
1581 | Production::PrecRight { content, .. }
1582 | Production::PrecDynamic { content, .. }
1583 | Production::Reserved { content, .. } => has_field_in(content, edge_kinds),
1584 _ => false,
1585 }
1586}
1587
1588fn has_relevant_constraint(
1589 production: &Production,
1590 schema: &Schema,
1591 vertex_id: &panproto_gat::Name,
1592) -> bool {
1593 let constraints = match schema.constraints.get(vertex_id) {
1594 Some(c) => c,
1595 None => return false,
1596 };
1597 fn walk(production: &Production, constraints: &[panproto_schema::Constraint]) -> bool {
1598 match production {
1599 Production::String { value } => constraints
1600 .iter()
1601 .any(|c| c.value == *value || c.sort.as_ref() == value),
1602 Production::Field { name, content } => {
1603 constraints.iter().any(|c| c.sort.as_ref() == name) || walk(content, constraints)
1604 }
1605 Production::Seq { members } | Production::Choice { members } => {
1606 members.iter().any(|m| walk(m, constraints))
1607 }
1608 Production::Repeat { content }
1609 | Production::Repeat1 { content }
1610 | Production::Optional { content }
1611 | Production::Alias { content, .. }
1612 | Production::Token { content }
1613 | Production::ImmediateToken { content }
1614 | Production::Prec { content, .. }
1615 | Production::PrecLeft { content, .. }
1616 | Production::PrecRight { content, .. }
1617 | Production::PrecDynamic { content, .. }
1618 | Production::Reserved { content, .. } => walk(content, constraints),
1619 _ => false,
1620 }
1621 }
1622 walk(production, constraints)
1623}
1624
1625fn children_for<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Vec<&'a Edge> {
1626 // Walk `outgoing` (insertion-ordered by SchemaBuilder via SmallVec
1627 // append) rather than the unordered `edges` HashMap so abstract
1628 // schemas under REPEAT(CHOICE(...)) preserve the order their edges
1629 // were inserted in. The previous implementation walked the HashMap
1630 // and sorted lexicographically by (kind, target id), which fused
1631 // interleaved children of the same kind into runs (e.g. a sequence
1632 // [symbol, punct, int, symbol, punct, int] became [symbol, symbol,
1633 // punct, punct, int, int] after the lex sort).
1634 let Some(edges) = schema.outgoing.get(vertex_id) else {
1635 return Vec::new();
1636 };
1637
1638 // Look up the canonical Edge reference (the key in `schema.edges`)
1639 // for each entry in `outgoing`. Falls back to the SmallVec entry if
1640 // the canonical key is missing, which would indicate index drift.
1641 let mut indexed: Vec<(usize, u32, &Edge)> = edges
1642 .iter()
1643 .enumerate()
1644 .map(|(i, e)| {
1645 let canonical = schema.edges.get_key_value(e).map_or(e, |(k, _)| k);
1646 let pos = schema.orderings.get(canonical).copied().unwrap_or(u32::MAX);
1647 (i, pos, canonical)
1648 })
1649 .collect();
1650
1651 // Stable sort by (explicit-ordering, insertion-index). Edges with
1652 // an explicit `orderings` entry come first in their declared order;
1653 // the remainder fall through in insertion order.
1654 indexed.sort_by_key(|(i, pos, _)| (*pos, *i));
1655 indexed.into_iter().map(|(_, _, e)| e).collect()
1656}
1657
1658fn vertex_id_kind<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1659 schema.vertices.get(vertex_id).map(|v| v.kind.as_ref())
1660}
1661
1662fn literal_value<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1663 schema
1664 .constraints
1665 .get(vertex_id)?
1666 .iter()
1667 .find(|c| c.sort.as_ref() == "literal-value")
1668 .map(|c| c.value.as_str())
1669}
1670
1671fn placeholder_for_pattern(pattern: &str) -> String {
1672 // Heuristic placeholder for unconstrained PATTERN terminals.
1673 //
1674 // First handle the "the regex IS a literal escape" cases that
1675 // tree-sitter grammars use as separators (`\n`, `\r\n`, `;`,
1676 // etc.); emitting the matching character is always preferable
1677 // to a `_x` identifier-like placeholder when the surrounding
1678 // grammar expects a separator.
1679 let simple_lit = decode_simple_pattern_literal(pattern);
1680 if let Some(lit) = simple_lit {
1681 return lit;
1682 }
1683
1684 if pattern.contains("[0-9]") || pattern.contains("\\d") {
1685 "0".into()
1686 } else if pattern.contains("[a-zA-Z_]") || pattern.contains("\\w") {
1687 "_x".into()
1688 } else if pattern.contains('"') || pattern.contains('\'') {
1689 "\"\"".into()
1690 } else {
1691 "_".into()
1692 }
1693}
1694
1695/// Decode a tree-sitter PATTERN whose regex is a simple literal
1696/// (newline, semicolon, comma, etc.) to the byte sequence it matches.
1697/// Returns `None` for patterns with character classes, alternations,
1698/// or quantifiers; the caller falls back to the heuristic placeholder.
1699fn decode_simple_pattern_literal(pattern: &str) -> Option<String> {
1700 // Skip patterns containing regex metachars that would broaden the
1701 // match beyond a single literal byte sequence.
1702 if pattern
1703 .chars()
1704 .any(|c| matches!(c, '[' | ']' | '(' | ')' | '*' | '+' | '?' | '|' | '{' | '}'))
1705 {
1706 return None;
1707 }
1708 let mut out = String::new();
1709 let mut chars = pattern.chars();
1710 while let Some(c) = chars.next() {
1711 if c == '\\' {
1712 match chars.next() {
1713 Some('n') => out.push('\n'),
1714 Some('r') => out.push('\r'),
1715 Some('t') => out.push('\t'),
1716 Some('\\') => out.push('\\'),
1717 Some('/') => out.push('/'),
1718 Some(other) => out.push(other),
1719 None => return None,
1720 }
1721 } else {
1722 out.push(c);
1723 }
1724 }
1725 Some(out)
1726}
1727
1728// ═══════════════════════════════════════════════════════════════════
1729// Token list output with Spacing algebra
1730// ═══════════════════════════════════════════════════════════════════
1731//
1732// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
1733// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
1734// by `FormatPolicy`. Separating the structural output from the layout
1735// decision means each phase has one job: emit walks the grammar and
1736// pushes tokens; layout is a single fold, locally driven by adjacent
1737// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
1738
1739#[derive(Clone)]
1740enum Token {
1741 /// A user-visible terminal contributed by the grammar.
1742 Lit(String),
1743 /// `indent_open` marker emitted when a `Lit` matched the policy's
1744 /// open list. Carried as a separate token so layout can decide to
1745 /// break + indent without re-scanning.
1746 IndentOpen,
1747 /// `indent_close` marker emitted before a closer-`Lit`.
1748 IndentClose,
1749 /// "Break a line here if not already at line start" — used after
1750 /// statements/declarations and after open braces.
1751 LineBreak,
1752}
1753
1754struct Output<'a> {
1755 tokens: Vec<Token>,
1756 policy: &'a FormatPolicy,
1757}
1758
1759#[derive(Clone)]
1760struct OutputSnapshot {
1761 tokens_len: usize,
1762}
1763
1764impl<'a> Output<'a> {
1765 fn new(policy: &'a FormatPolicy) -> Self {
1766 Self {
1767 tokens: Vec::new(),
1768 policy,
1769 }
1770 }
1771
1772 fn token(&mut self, value: &str) {
1773 if value.is_empty() {
1774 return;
1775 }
1776
1777 if self.policy.indent_close.iter().any(|t| t == value) {
1778 self.tokens.push(Token::IndentClose);
1779 }
1780
1781 self.tokens.push(Token::Lit(value.to_owned()));
1782
1783 if self.policy.indent_open.iter().any(|t| t == value) {
1784 self.tokens.push(Token::IndentOpen);
1785 self.tokens.push(Token::LineBreak);
1786 } else if self.policy.line_break_after.iter().any(|t| t == value) {
1787 self.tokens.push(Token::LineBreak);
1788 }
1789 }
1790
1791 fn newline(&mut self) {
1792 self.tokens.push(Token::LineBreak);
1793 }
1794
1795 /// Open an indent scope: subsequent `LineBreak`s render at the
1796 /// new depth until a matching `indent_close` pops it. Used by the
1797 /// external-token fallback to render indent-based grammars'
1798 /// `_indent` scanner outputs.
1799 fn indent_open(&mut self) {
1800 self.tokens.push(Token::IndentOpen);
1801 self.tokens.push(Token::LineBreak);
1802 }
1803
1804 /// Close one indent scope opened by `indent_open`.
1805 fn indent_close(&mut self) {
1806 self.tokens.push(Token::IndentClose);
1807 }
1808
1809 fn snapshot(&self) -> OutputSnapshot {
1810 OutputSnapshot {
1811 tokens_len: self.tokens.len(),
1812 }
1813 }
1814
1815 fn restore(&mut self, snap: OutputSnapshot) {
1816 self.tokens.truncate(snap.tokens_len);
1817 }
1818
1819 fn finish(self) -> Vec<u8> {
1820 layout(&self.tokens, self.policy)
1821 }
1822}
1823
1824/// Fold a token list into bytes. The algebra:
1825/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
1826/// * `IndentOpen` / `IndentClose` adjust a depth counter,
1827/// * `LineBreak` writes `\n` if not already at line start, then the
1828/// next `Lit` writes `indent * indent_width` spaces of indent.
1829fn layout(tokens: &[Token], policy: &FormatPolicy) -> Vec<u8> {
1830 let mut bytes = Vec::new();
1831 let mut indent: usize = 0;
1832 let mut at_line_start = true;
1833 let mut last_lit: Option<&str> = None;
1834 // True iff, at the moment `last_lit` was emitted, the cursor was at a
1835 // position where the grammar expects an operand: start of stream / line,
1836 // just after an open paren / bracket / brace, just after a separator like
1837 // `,` or `;`, or just after a binary / assignment operator. Used by
1838 // `needs_space_between` to recognise `last_lit` as a tight unary prefix
1839 // (`f(-1.0)`) rather than a spaced binary operator (`a - b`).
1840 let mut last_was_in_operand_position = true;
1841 let mut expecting_operand = true;
1842 let newline = policy.newline.as_bytes();
1843 let separator = policy.separator.as_bytes();
1844
1845 for tok in tokens {
1846 match tok {
1847 Token::IndentOpen => indent += 1,
1848 Token::IndentClose => {
1849 indent = indent.saturating_sub(1);
1850 if !at_line_start {
1851 bytes.extend_from_slice(newline);
1852 at_line_start = true;
1853 expecting_operand = true;
1854 }
1855 }
1856 Token::LineBreak => {
1857 if !at_line_start {
1858 bytes.extend_from_slice(newline);
1859 at_line_start = true;
1860 expecting_operand = true;
1861 }
1862 }
1863 Token::Lit(value) => {
1864 if at_line_start {
1865 bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
1866 } else if let Some(prev) = last_lit {
1867 if needs_space_between(prev, value, last_was_in_operand_position) {
1868 bytes.extend_from_slice(separator);
1869 }
1870 }
1871 bytes.extend_from_slice(value.as_bytes());
1872 at_line_start = false;
1873 last_was_in_operand_position = expecting_operand;
1874 expecting_operand = leaves_operand_position(value);
1875 last_lit = Some(value.as_str());
1876 }
1877 }
1878 }
1879
1880 if !at_line_start {
1881 bytes.extend_from_slice(newline);
1882 }
1883 bytes
1884}
1885
1886/// True iff emitting `tok` leaves the cursor in a position where the
1887/// grammar expects an operand next. Operand-introducing tokens are open
1888/// punctuation, separators, and operator-like strings; operand-terminating
1889/// tokens are identifiers, literals, and closing punctuation.
1890fn leaves_operand_position(tok: &str) -> bool {
1891 if tok.is_empty() {
1892 return true;
1893 }
1894 if is_punct_open(tok) {
1895 return true;
1896 }
1897 if matches!(tok, "," | ";") {
1898 return true;
1899 }
1900 if is_punct_close(tok) {
1901 return false;
1902 }
1903 if first_is_alnum_or_underscore(tok) || last_ends_with_alnum(tok) {
1904 return false;
1905 }
1906 // Pure punctuation/operator runs (`=`, `+`, `-`, `<=`, `>>`, …) leave
1907 // the cursor expecting another operand.
1908 true
1909}
1910
1911fn needs_space_between(last: &str, next: &str, expecting_operand: bool) -> bool {
1912 if last.is_empty() || next.is_empty() {
1913 return false;
1914 }
1915 if is_punct_open(last) || is_punct_open(next) {
1916 return false;
1917 }
1918 if is_punct_close(next) {
1919 return false;
1920 }
1921 if is_punct_close(last) && is_punct_punctuation(next) {
1922 return false;
1923 }
1924 if last == "." || next == "." {
1925 return false;
1926 }
1927 // Tight unary prefix: `last` is a sign/logical-not operator emitted
1928 // where the grammar expected an operand, so it glues to `next`.
1929 // `expecting_operand` here means: just before `last` was emitted,
1930 // the cursor expected an operand, which makes `last` a unary prefix.
1931 // Examples: `f(-1.0)`, `[ -2, 3 ]`, `return -x`, `a = !flag`.
1932 if expecting_operand && is_unary_prefix_operator(last) && first_is_operand_start(next) {
1933 return false;
1934 }
1935 if last_is_word_like(last) && first_is_word_like(next) {
1936 return true;
1937 }
1938 if last_ends_with_alnum(last) && first_is_alnum_or_underscore(next) {
1939 return true;
1940 }
1941 // Adjacent operator runs: keep them apart so the lexer doesn't glue
1942 // `>` and `=` into `>=` unintentionally.
1943 true
1944}
1945
1946fn is_unary_prefix_operator(s: &str) -> bool {
1947 matches!(s, "-" | "+" | "!" | "~")
1948}
1949
1950fn first_is_operand_start(s: &str) -> bool {
1951 s.chars()
1952 .next()
1953 .map(|c| c.is_alphanumeric() || c == '_' || c == '.' || c == '(')
1954 .unwrap_or(false)
1955}
1956
1957fn is_punct_open(s: &str) -> bool {
1958 matches!(s, "(" | "[" | "{" | "\"" | "'" | "`")
1959}
1960
1961fn is_punct_close(s: &str) -> bool {
1962 matches!(s, ")" | "]" | "}" | "," | ";" | ":" | "\"" | "'" | "`")
1963}
1964
1965fn is_punct_punctuation(s: &str) -> bool {
1966 matches!(s, "," | ";" | ":" | "." | ")" | "]" | "}")
1967}
1968
1969fn last_is_word_like(s: &str) -> bool {
1970 s.chars()
1971 .next_back()
1972 .map(|c| c.is_alphanumeric() || c == '_')
1973 .unwrap_or(false)
1974}
1975
1976fn first_is_word_like(s: &str) -> bool {
1977 s.chars()
1978 .next()
1979 .map(|c| c.is_alphanumeric() || c == '_')
1980 .unwrap_or(false)
1981}
1982
1983fn last_ends_with_alnum(s: &str) -> bool {
1984 s.chars()
1985 .next_back()
1986 .map(char::is_alphanumeric)
1987 .unwrap_or(false)
1988}
1989
1990fn first_is_alnum_or_underscore(s: &str) -> bool {
1991 s.chars()
1992 .next()
1993 .map(|c| c.is_alphanumeric() || c == '_')
1994 .unwrap_or(false)
1995}
1996
1997#[cfg(test)]
1998mod tests {
1999 use super::*;
2000
2001 #[test]
2002 fn parses_simple_grammar_json() {
2003 let bytes = br#"{
2004 "name": "tiny",
2005 "rules": {
2006 "program": {
2007 "type": "SEQ",
2008 "members": [
2009 {"type": "STRING", "value": "hello"},
2010 {"type": "STRING", "value": ";"}
2011 ]
2012 }
2013 }
2014 }"#;
2015 let g = Grammar::from_bytes("tiny", bytes).expect("valid tiny grammar");
2016 assert!(g.rules.contains_key("program"));
2017 }
2018
2019 #[test]
2020 fn output_emits_punctuation_without_leading_space() {
2021 let policy = FormatPolicy::default();
2022 let mut out = Output::new(&policy);
2023 out.token("foo");
2024 out.token("(");
2025 out.token(")");
2026 out.token(";");
2027 let bytes = out.finish();
2028 let s = std::str::from_utf8(&bytes).expect("ascii output");
2029 assert!(s.starts_with("foo();"), "got {s:?}");
2030 }
2031
2032 #[test]
2033 fn grammar_from_bytes_rejects_malformed_input() {
2034 let result = Grammar::from_bytes("malformed", b"not json");
2035 let err = result.expect_err("malformed bytes must yield Err");
2036 let msg = err.to_string();
2037 assert!(
2038 msg.contains("malformed"),
2039 "error message should name the protocol: {msg:?}"
2040 );
2041 }
2042
2043 #[test]
2044 fn output_indents_after_open_brace() {
2045 let policy = FormatPolicy::default();
2046 let mut out = Output::new(&policy);
2047 out.token("fn");
2048 out.token("foo");
2049 out.token("(");
2050 out.token(")");
2051 out.token("{");
2052 out.token("body");
2053 out.token("}");
2054 let bytes = out.finish();
2055 let s = std::str::from_utf8(&bytes).expect("ascii output");
2056 assert!(s.contains("{\n"), "newline after opening brace: {s:?}");
2057 assert!(s.contains("body"), "body inside block: {s:?}");
2058 assert!(s.ends_with("}\n"), "newline after closing brace: {s:?}");
2059 }
2060
2061 #[test]
2062 fn output_no_space_between_word_and_dot() {
2063 let policy = FormatPolicy::default();
2064 let mut out = Output::new(&policy);
2065 out.token("foo");
2066 out.token(".");
2067 out.token("bar");
2068 let bytes = out.finish();
2069 let s = std::str::from_utf8(&bytes).expect("ascii output");
2070 assert!(s.starts_with("foo.bar"), "no space around dot: {s:?}");
2071 }
2072
2073 #[test]
2074 fn output_snapshot_restore_truncates_bytes() {
2075 let policy = FormatPolicy::default();
2076 let mut out = Output::new(&policy);
2077 out.token("keep");
2078 let snap = out.snapshot();
2079 out.token("drop");
2080 out.token("more");
2081 out.restore(snap);
2082 out.token("after");
2083 let bytes = out.finish();
2084 let s = std::str::from_utf8(&bytes).expect("ascii output");
2085 assert!(s.contains("keep"), "kept token survives: {s:?}");
2086 assert!(s.contains("after"), "post-restore token visible: {s:?}");
2087 assert!(!s.contains("drop"), "rolled-back token removed: {s:?}");
2088 assert!(!s.contains("more"), "rolled-back token removed: {s:?}");
2089 }
2090
2091 #[test]
2092 fn child_cursor_take_field_consumes_once() {
2093 let edges_owned: Vec<Edge> = vec![Edge {
2094 src: panproto_gat::Name::from("p"),
2095 tgt: panproto_gat::Name::from("c"),
2096 kind: panproto_gat::Name::from("name"),
2097 name: None,
2098 }];
2099 let edges: Vec<&Edge> = edges_owned.iter().collect();
2100 let mut cursor = ChildCursor::new(&edges);
2101 let first = cursor.take_field("name");
2102 let second = cursor.take_field("name");
2103 assert!(first.is_some(), "first take returns the edge");
2104 assert!(
2105 second.is_none(),
2106 "second take returns None (already consumed)"
2107 );
2108 }
2109
2110 #[test]
2111 fn child_cursor_take_matching_predicate() {
2112 let edges_owned: Vec<Edge> = vec![
2113 Edge {
2114 src: "p".into(),
2115 tgt: "c1".into(),
2116 kind: "child_of".into(),
2117 name: None,
2118 },
2119 Edge {
2120 src: "p".into(),
2121 tgt: "c2".into(),
2122 kind: "key".into(),
2123 name: None,
2124 },
2125 ];
2126 let edges: Vec<&Edge> = edges_owned.iter().collect();
2127 let mut cursor = ChildCursor::new(&edges);
2128 assert!(cursor.has_matching(|e| e.kind.as_ref() == "key"));
2129 let taken = cursor.take_matching(|e| e.kind.as_ref() == "key");
2130 assert!(taken.is_some());
2131 assert!(
2132 !cursor.has_matching(|e| e.kind.as_ref() == "key"),
2133 "consumed edge no longer matches"
2134 );
2135 assert!(
2136 cursor.has_matching(|e| e.kind.as_ref() == "child_of"),
2137 "the other edge is still available"
2138 );
2139 }
2140
2141 #[test]
2142 fn kind_satisfies_symbol_direct_match() {
2143 let bytes = br#"{
2144 "name": "tiny",
2145 "rules": {
2146 "x": {"type": "STRING", "value": "x"}
2147 }
2148 }"#;
2149 let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
2150 assert!(kind_satisfies_symbol(&g, Some("x"), "x"));
2151 assert!(!kind_satisfies_symbol(&g, Some("y"), "x"));
2152 assert!(!kind_satisfies_symbol(&g, None, "x"));
2153 }
2154
2155 #[test]
2156 fn kind_satisfies_symbol_through_hidden_rule() {
2157 let bytes = br#"{
2158 "name": "tiny",
2159 "rules": {
2160 "_value": {
2161 "type": "CHOICE",
2162 "members": [
2163 {"type": "SYMBOL", "name": "object"},
2164 {"type": "SYMBOL", "name": "number"}
2165 ]
2166 },
2167 "object": {"type": "STRING", "value": "{}"},
2168 "number": {"type": "PATTERN", "value": "[0-9]+"}
2169 }
2170 }"#;
2171 let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
2172 assert!(
2173 kind_satisfies_symbol(&g, Some("number"), "_value"),
2174 "number is reachable from _value via CHOICE"
2175 );
2176 assert!(
2177 kind_satisfies_symbol(&g, Some("object"), "_value"),
2178 "object is reachable from _value via CHOICE"
2179 );
2180 assert!(
2181 !kind_satisfies_symbol(&g, Some("string"), "_value"),
2182 "string is NOT among the alternatives"
2183 );
2184 }
2185
2186 #[test]
2187 fn first_symbol_skips_string_terminals() {
2188 let prod: Production = serde_json::from_str(
2189 r#"{
2190 "type": "SEQ",
2191 "members": [
2192 {"type": "STRING", "value": "{"},
2193 {"type": "SYMBOL", "name": "body"},
2194 {"type": "STRING", "value": "}"}
2195 ]
2196 }"#,
2197 )
2198 .expect("valid SEQ");
2199 assert_eq!(first_symbol(&prod), Some("body"));
2200 }
2201
2202 #[test]
2203 fn placeholder_for_pattern_routes_by_regex_class() {
2204 assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
2205 assert_eq!(placeholder_for_pattern("[a-zA-Z_]\\w*"), "_x");
2206 assert_eq!(placeholder_for_pattern("\"[^\"]*\""), "\"\"");
2207 assert_eq!(placeholder_for_pattern("\\d+\\.\\d+"), "0");
2208 }
2209
2210 #[test]
2211 fn format_policy_default_breaks_after_semicolon() {
2212 let policy = FormatPolicy::default();
2213 assert!(policy.line_break_after.iter().any(|t| t == ";"));
2214 assert!(policy.indent_open.iter().any(|t| t == "{"));
2215 assert!(policy.indent_close.iter().any(|t| t == "}"));
2216 assert_eq!(policy.indent_width, 2);
2217 }
2218
2219 #[test]
2220 fn placeholder_decodes_literal_pattern_separators() {
2221 // PATTERN regexes that match a single literal byte sequence
2222 // (newline, semicolon, comma) emit the bytes verbatim instead
2223 // of falling through to the `_` catch-all.
2224 assert_eq!(placeholder_for_pattern("\\n"), "\n");
2225 assert_eq!(placeholder_for_pattern("\\r\\n"), "\r\n");
2226 assert_eq!(placeholder_for_pattern(";"), ";");
2227 // Patterns with character classes / alternation still route
2228 // through the heuristic.
2229 assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
2230 assert_eq!(placeholder_for_pattern("a|b"), "_");
2231 }
2232
2233 #[test]
2234 fn supertypes_decode_from_grammar_json_strings() {
2235 // Tree-sitter older grammars list supertypes as bare strings.
2236 let bytes = br#"{
2237 "name": "tiny",
2238 "supertypes": ["expression"],
2239 "rules": {
2240 "expression": {
2241 "type": "CHOICE",
2242 "members": [
2243 {"type": "SYMBOL", "name": "binary_expression"},
2244 {"type": "SYMBOL", "name": "identifier"}
2245 ]
2246 },
2247 "binary_expression": {"type": "STRING", "value": "x"},
2248 "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2249 }
2250 }"#;
2251 let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2252 assert!(g.supertypes.contains("expression"));
2253 // identifier matches the supertype `expression`.
2254 assert!(kind_satisfies_symbol(&g, Some("identifier"), "expression"));
2255 // unrelated kinds do not.
2256 assert!(!kind_satisfies_symbol(&g, Some("string"), "expression"));
2257 }
2258
2259 #[test]
2260 fn supertypes_decode_from_grammar_json_objects() {
2261 // Recent grammars list supertypes as `{type: SYMBOL, name: ...}`
2262 // entries instead of bare strings.
2263 let bytes = br#"{
2264 "name": "tiny",
2265 "supertypes": [{"type": "SYMBOL", "name": "stmt"}],
2266 "rules": {
2267 "stmt": {
2268 "type": "CHOICE",
2269 "members": [
2270 {"type": "SYMBOL", "name": "while_stmt"},
2271 {"type": "SYMBOL", "name": "if_stmt"}
2272 ]
2273 },
2274 "while_stmt": {"type": "STRING", "value": "while"},
2275 "if_stmt": {"type": "STRING", "value": "if"}
2276 }
2277 }"#;
2278 let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2279 assert!(g.supertypes.contains("stmt"));
2280 assert!(kind_satisfies_symbol(&g, Some("while_stmt"), "stmt"));
2281 }
2282
2283 #[test]
2284 fn alias_value_matches_kind() {
2285 // A named ALIAS rewrites the parser-visible kind to `value`;
2286 // `kind_satisfies_symbol` should accept that rewritten kind
2287 // when looking up the original SYMBOL.
2288 let bytes = br#"{
2289 "name": "tiny",
2290 "rules": {
2291 "_package_identifier": {
2292 "type": "ALIAS",
2293 "named": true,
2294 "value": "package_identifier",
2295 "content": {"type": "SYMBOL", "name": "identifier"}
2296 },
2297 "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2298 }
2299 }"#;
2300 let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2301 assert!(kind_satisfies_symbol(
2302 &g,
2303 Some("package_identifier"),
2304 "_package_identifier"
2305 ));
2306 }
2307
2308 #[test]
2309 fn referenced_symbols_walks_nested_seq() {
2310 let prod: Production = serde_json::from_str(
2311 r#"{
2312 "type": "SEQ",
2313 "members": [
2314 {"type": "CHOICE", "members": [
2315 {"type": "SYMBOL", "name": "attribute_item"},
2316 {"type": "BLANK"}
2317 ]},
2318 {"type": "SYMBOL", "name": "parameter"},
2319 {"type": "REPEAT", "content": {
2320 "type": "SEQ",
2321 "members": [
2322 {"type": "STRING", "value": ","},
2323 {"type": "SYMBOL", "name": "parameter"}
2324 ]
2325 }}
2326 ]
2327 }"#,
2328 )
2329 .expect("seq");
2330 let symbols = referenced_symbols(&prod);
2331 assert!(symbols.contains(&"attribute_item"));
2332 assert!(symbols.contains(&"parameter"));
2333 }
2334
2335 #[test]
2336 fn literal_strings_collects_choice_members() {
2337 let prod: Production = serde_json::from_str(
2338 r#"{
2339 "type": "CHOICE",
2340 "members": [
2341 {"type": "STRING", "value": "+"},
2342 {"type": "STRING", "value": "-"},
2343 {"type": "STRING", "value": "*"}
2344 ]
2345 }"#,
2346 )
2347 .expect("choice");
2348 let strings = literal_strings(&prod);
2349 assert_eq!(strings, vec!["+", "-", "*"]);
2350 }
2351
2352 /// The ocaml and javascript grammars (tree-sitter ≥ 0.25) emit a
2353 /// `RESERVED` rule kind that an earlier deserialiser rejected
2354 /// with `unknown variant "RESERVED"`. Verify both that the bare
2355 /// variant deserialises and that a `RESERVED`-wrapped grammar is
2356 /// loadable end-to-end via [`Grammar::from_bytes`].
2357 #[test]
2358 fn reserved_variant_deserialises() {
2359 let prod: Production = serde_json::from_str(
2360 r#"{
2361 "type": "RESERVED",
2362 "content": {"type": "SYMBOL", "name": "_lowercase_identifier"},
2363 "context_name": "attribute_id"
2364 }"#,
2365 )
2366 .expect("RESERVED parses");
2367 match prod {
2368 Production::Reserved { content, .. } => match *content {
2369 Production::Symbol { name } => assert_eq!(name, "_lowercase_identifier"),
2370 other => panic!("expected inner SYMBOL, got {other:?}"),
2371 },
2372 other => panic!("expected RESERVED, got {other:?}"),
2373 }
2374 }
2375
2376 #[test]
2377 fn reserved_grammar_loads_end_to_end() {
2378 let bytes = br#"{
2379 "name": "tiny_reserved",
2380 "rules": {
2381 "program": {
2382 "type": "RESERVED",
2383 "content": {"type": "SYMBOL", "name": "ident"},
2384 "context_name": "keywords"
2385 },
2386 "ident": {"type": "PATTERN", "value": "[a-z]+"}
2387 }
2388 }"#;
2389 let g = Grammar::from_bytes("tiny_reserved", bytes).expect("RESERVED-using grammar loads");
2390 assert!(g.rules.contains_key("program"));
2391 }
2392
2393 #[test]
2394 fn reserved_walker_helpers_recurse_into_content() {
2395 // The walker's helpers (first_symbol, has_field_in,
2396 // referenced_symbols, ...) all need to descend through
2397 // RESERVED into its content. If they bail at RESERVED, the
2398 // `pick_choice_with_cursor` heuristic ranks the alt below
2399 // alts that DO recurse, which produces wrong emit output
2400 // even when the deserialiser doesn't crash.
2401 let prod: Production = serde_json::from_str(
2402 r#"{
2403 "type": "RESERVED",
2404 "content": {
2405 "type": "FIELD",
2406 "name": "lhs",
2407 "content": {"type": "SYMBOL", "name": "expr"}
2408 },
2409 "context_name": "ctx"
2410 }"#,
2411 )
2412 .expect("nested RESERVED parses");
2413 assert_eq!(first_symbol(&prod), Some("expr"));
2414 assert!(has_field_in(&prod, &["lhs"]));
2415 let symbols = referenced_symbols(&prod);
2416 assert!(symbols.contains(&"expr"));
2417 }
2418}