panproto_parse/emit_pretty.rs
1#![allow(
2 clippy::module_name_repetitions,
3 clippy::too_many_lines,
4 clippy::too_many_arguments,
5 clippy::map_unwrap_or,
6 clippy::option_if_let_else,
7 clippy::elidable_lifetime_names,
8 clippy::items_after_statements,
9 clippy::needless_pass_by_value,
10 clippy::single_match_else,
11 clippy::manual_let_else,
12 clippy::match_same_arms,
13 clippy::missing_const_for_fn,
14 clippy::single_char_pattern,
15 clippy::naive_bytecount,
16 clippy::expect_used,
17 clippy::redundant_pub_crate,
18 clippy::used_underscore_binding,
19 clippy::redundant_field_names,
20 clippy::struct_field_names,
21 clippy::redundant_else,
22 clippy::similar_names
23)]
24
25//! De-novo source emission from a by-construction schema.
26//!
27//! [`AstParser::emit`] reconstructs source from byte-position fragments
28//! that the parser stored on the schema during `parse`. That works for
29//! edit pipelines (`parse → transform → emit`) but fails for schemas
30//! built by hand (`SchemaBuilder` with no parse history): they carry
31//! no `start-byte`, no `interstitial-N`, no `literal-value`, and the
32//! reconstructor returns `Err(EmitFailed { reason: "schema has no
33//! text fragments" })`.
34//!
35//! This module renders such schemas to source bytes by walking
36//! tree-sitter's `grammar.json` production rules. For each schema
37//! vertex of kind `K`, the walker looks up `K`'s production in the
38//! grammar and emits its body in order:
39//!
40//! - `STRING` nodes contribute literal token bytes directly.
41//! - `SYMBOL` and `FIELD` nodes recurse into the schema's children,
42//! matching by edge kind (which is the tree-sitter field name).
43//! - `SEQ` emits its members in order.
44//! - `CHOICE` picks the alternative whose head `SYMBOL` matches an
45//! actual child kind, or whose terminals appear in the rendered
46//! prefix; falls back to the first non-`BLANK` alternative when no
47//! alternative matches.
48//! - `REPEAT` and `REPEAT1` emit their content once per matching
49//! child edge in declared order.
50//! - `OPTIONAL` emits its content iff a corresponding child edge or
51//! constraint is populated.
52//! - `PATTERN` is a regex placeholder for variable-text terminals
53//! (identifiers, numbers, quoted strings). The walker emits a
54//! `literal-value` constraint when present and otherwise falls
55//! back to a placeholder derived from the regex shape.
56//! - `BLANK`, `TOKEN`, `IMMEDIATE_TOKEN`, `ALIAS`, `PREC*` are
57//! handled transparently (the inner content is emitted; the
58//! wrapper is dropped).
59//!
60//! Whitespace and indentation come from a `FormatPolicy` applied
61//! during emission. The default policy inserts a single space between
62//! adjacent tokens, a newline after `;` / `}` / `{`, and tracks an
63//! indent counter on `{` / `}` boundaries.
64//!
65//! Output is *syntactically valid* for any grammar that ships
66//! `grammar.json`. Idiomatic formatting (rustfmt-style spacing rules,
67//! per-language conventions) is a polish layer that lives outside
68//! this module.
69
70use std::collections::BTreeMap;
71
72use panproto_schema::{Edge, Schema};
73use serde::Deserialize;
74
75use crate::error::ParseError;
76
77// ═══════════════════════════════════════════════════════════════════
78// Grammar JSON model
79// ═══════════════════════════════════════════════════════════════════
80
81/// A single tree-sitter production rule.
82///
83/// Mirrors the shape emitted by `tree-sitter generate`: every node has
84/// a `type` discriminator that selects a structural variant. The
85/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
86/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
87/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
88/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
89#[derive(Debug, Clone, Deserialize)]
90#[serde(tag = "type")]
91#[non_exhaustive]
92pub enum Production {
93 /// Concatenation of productions.
94 #[serde(rename = "SEQ")]
95 Seq {
96 /// Ordered members; each is emitted in turn.
97 members: Vec<Self>,
98 },
99 /// Alternation between productions.
100 #[serde(rename = "CHOICE")]
101 Choice {
102 /// Alternatives; the walker picks one based on the schema's
103 /// children and constraints.
104 members: Vec<Self>,
105 },
106 /// Zero-or-more repetition.
107 #[serde(rename = "REPEAT")]
108 Repeat {
109 /// The repeated body.
110 content: Box<Self>,
111 },
112 /// One-or-more repetition.
113 #[serde(rename = "REPEAT1")]
114 Repeat1 {
115 /// The repeated body.
116 content: Box<Self>,
117 },
118 /// Optional inclusion (zero or one).
119 ///
120 /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
121 /// BLANK }`, but recent generator versions also emit explicit
122 /// `OPTIONAL` nodes; both shapes are accepted.
123 #[serde(rename = "OPTIONAL")]
124 Optional {
125 /// The optional body.
126 content: Box<Self>,
127 },
128 /// Reference to another rule by name.
129 #[serde(rename = "SYMBOL")]
130 Symbol {
131 /// Name of the referenced rule (matches a vertex kind on the
132 /// schema side).
133 name: String,
134 },
135 /// Literal token bytes.
136 #[serde(rename = "STRING")]
137 String {
138 /// The literal token. Emitted verbatim.
139 value: String,
140 },
141 /// Regex-matched terminal.
142 ///
143 /// At parse time this matches arbitrary bytes; at emit time the
144 /// walker substitutes a `literal-value` constraint when present
145 /// and falls back to a placeholder otherwise.
146 #[serde(rename = "PATTERN")]
147 Pattern {
148 /// The original regex.
149 value: String,
150 },
151 /// The empty production. Emits nothing.
152 #[serde(rename = "BLANK")]
153 Blank,
154 /// Named field over a content production.
155 ///
156 /// The field `name` matches an edge kind on the schema side; the
157 /// walker resolves the corresponding child vertex and recurses
158 /// into `content` with that child as context.
159 #[serde(rename = "FIELD")]
160 Field {
161 /// Field name (matches edge kind).
162 name: String,
163 /// The contents of the field.
164 content: Box<Self>,
165 },
166 /// An aliased production.
167 ///
168 /// `value` records the parser-visible kind; the walker emits
169 /// `content` and ignores the alias rename.
170 #[serde(rename = "ALIAS")]
171 Alias {
172 /// The aliased content.
173 content: Box<Self>,
174 /// Whether the alias is a named node.
175 #[serde(default)]
176 named: bool,
177 /// The alias's surface name.
178 #[serde(default)]
179 value: String,
180 },
181 /// A token wrapper.
182 ///
183 /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
184 /// lexical token; the walker emits the inner content unchanged.
185 #[serde(rename = "TOKEN")]
186 Token {
187 /// The wrapped content.
188 content: Box<Self>,
189 },
190 /// An immediate-token wrapper (no preceding whitespace).
191 ///
192 /// Treated like [`Production::Token`] for emit purposes.
193 #[serde(rename = "IMMEDIATE_TOKEN")]
194 ImmediateToken {
195 /// The wrapped content.
196 content: Box<Self>,
197 },
198 /// Precedence wrapper.
199 #[serde(rename = "PREC")]
200 Prec {
201 /// Precedence value (numeric or string). Ignored at emit time.
202 #[allow(dead_code)]
203 value: serde_json::Value,
204 /// The wrapped content.
205 content: Box<Self>,
206 },
207 /// Left-associative precedence wrapper.
208 #[serde(rename = "PREC_LEFT")]
209 PrecLeft {
210 /// Precedence value. Ignored at emit time.
211 #[allow(dead_code)]
212 value: serde_json::Value,
213 /// The wrapped content.
214 content: Box<Self>,
215 },
216 /// Right-associative precedence wrapper.
217 #[serde(rename = "PREC_RIGHT")]
218 PrecRight {
219 /// Precedence value. Ignored at emit time.
220 #[allow(dead_code)]
221 value: serde_json::Value,
222 /// The wrapped content.
223 content: Box<Self>,
224 },
225 /// Dynamic precedence wrapper.
226 #[serde(rename = "PREC_DYNAMIC")]
227 PrecDynamic {
228 /// Precedence value. Ignored at emit time.
229 #[allow(dead_code)]
230 value: serde_json::Value,
231 /// The wrapped content.
232 content: Box<Self>,
233 },
234 /// Reserved-word wrapper (tree-sitter ≥ 0.25).
235 ///
236 /// Tree-sitter's `RESERVED` rule marks an inner production as a
237 /// reserved-word context: the parser excludes the listed identifiers
238 /// from being treated as the inner symbol. The `context_name`
239 /// metadata names the reserved-word set; the emitter does not need
240 /// it (we are walking schema → bytes, not enforcing reserved-word
241 /// constraints), so we emit the inner content unchanged, the same
242 /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
243 #[serde(rename = "RESERVED")]
244 Reserved {
245 /// The wrapped content.
246 content: Box<Self>,
247 /// Name of the reserved-word context. Ignored at emit time.
248 #[allow(dead_code)]
249 #[serde(default)]
250 context_name: String,
251 },
252}
253
254/// A grammar's production-rule table, deserialized from `grammar.json`.
255///
256/// Only the fields the emitter consumes are decoded; precedences,
257/// conflicts, externals, and other parser-only metadata are ignored.
258#[derive(Debug, Clone, Deserialize)]
259#[non_exhaustive]
260pub struct Grammar {
261 /// Grammar name (e.g. `"rust"`, `"typescript"`).
262 #[allow(dead_code)]
263 pub name: String,
264 /// Map from rule name (a vertex kind on the schema side) to
265 /// production. Entries are kept in lexical order so iteration
266 /// is deterministic.
267 pub rules: BTreeMap<String, Production>,
268 /// Supertypes declared in the grammar's `supertypes` field. A
269 /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
270 /// references; tree-sitter parsers report a node's kind as one
271 /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
272 /// than the supertype name (`parameter`), so the emitter needs to
273 /// know that a child kind in a subtype set should match the
274 /// supertype name when a SYMBOL references it.
275 #[serde(default, deserialize_with = "deserialize_supertypes")]
276 pub supertypes: std::collections::HashSet<String>,
277 /// Tree-sitter `extras` rules: the named symbols (typically comments)
278 /// that tree-sitter skips at parse time but records as children of the
279 /// surrounding vertex. They appear nowhere in the production grammar,
280 /// so the rule walker cannot reconcile them against the cursor — the
281 /// emit pass therefore drains them as a side channel: at vertex entry
282 /// and between REPEAT iterations any leading extras-kind edges are
283 /// consumed and emitted directly. The set is populated at
284 /// `Grammar::from_bytes` by collecting every `SYMBOL { name }` and
285 /// named `ALIAS { value, named: true }` under the top-level `extras`
286 /// array. Pattern-only extras (e.g. `\s` whitespace) are not vertex
287 /// kinds and are excluded.
288 #[serde(default, deserialize_with = "deserialize_extras")]
289 pub extras: std::collections::HashSet<String>,
290 /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
291 /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
292 /// reference on the schema side.
293 ///
294 /// Built once at [`Grammar::from_bytes`] time by walking each
295 /// hidden rule (`_`-prefixed), declared supertype, and named
296 /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
297 /// recording the closure. This replaces the prior heuristic
298 /// `kind_satisfies_symbol` that walked the rule body on every
299 /// query: lookups are now O(1) and the relation is exactly the
300 /// transitive closure of "is reachable via hidden / supertype /
301 /// alias dispatch", with no over-expansion through non-hidden
302 /// non-supertype rule references.
303 #[serde(skip)]
304 pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
305}
306
307fn deserialize_supertypes<'de, D>(
308 deserializer: D,
309) -> Result<std::collections::HashSet<String>, D::Error>
310where
311 D: serde::Deserializer<'de>,
312{
313 let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
314 let mut out = std::collections::HashSet::new();
315 for entry in entries {
316 match entry {
317 serde_json::Value::String(s) => {
318 out.insert(s);
319 }
320 serde_json::Value::Object(map) => {
321 if let Some(serde_json::Value::String(name)) = map.get("name") {
322 out.insert(name.clone());
323 }
324 }
325 _ => {}
326 }
327 }
328 Ok(out)
329}
330
331fn deserialize_extras<'de, D>(
332 deserializer: D,
333) -> Result<std::collections::HashSet<String>, D::Error>
334where
335 D: serde::Deserializer<'de>,
336{
337 let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
338 let mut out = std::collections::HashSet::new();
339 for entry in entries {
340 if let serde_json::Value::Object(map) = entry {
341 let ty = map.get("type").and_then(serde_json::Value::as_str);
342 match ty {
343 // SYMBOL { name: K } — the extras rule is a named symbol
344 // (typically `line_comment` / `block_comment`). The kind
345 // K appears as a real child vertex on the schema side.
346 Some("SYMBOL") => {
347 if let Some(serde_json::Value::String(name)) = map.get("name") {
348 out.insert(name.clone());
349 }
350 }
351 // ALIAS { content, value: V, named: true } — the extras
352 // rule renames its content; V is the kind on the schema.
353 Some("ALIAS") => {
354 let named = map
355 .get("named")
356 .and_then(serde_json::Value::as_bool)
357 .unwrap_or(false);
358 if named {
359 if let Some(serde_json::Value::String(value)) = map.get("value") {
360 out.insert(value.clone());
361 }
362 }
363 }
364 // PATTERN / STRING / TOKEN entries describe inter-token
365 // whitespace and have no vertex-side representation.
366 _ => {}
367 }
368 }
369 }
370 Ok(out)
371}
372
373impl Grammar {
374 /// Parse a grammar's `grammar.json` bytes.
375 ///
376 /// Builds the subtyping closure as part of construction so every
377 /// downstream lookup is O(1). The closure is the least relation
378 /// containing `(K, K)` for every rule key `K` and closed under:
379 ///
380 /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
381 /// reach SYMBOL `K`, then `K ⊑ S`.
382 /// - supertype expansion: if `S` is in the grammar's supertypes
383 /// block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
384 /// - alias renaming: if a rule body contains
385 /// `ALIAS { content: SYMBOL R, value: A, named: true }` where
386 /// `R` reaches kind `K` (or `K = R` when no further hop), then
387 /// `A ⊑ R` and `K ⊑ A`.
388 ///
389 /// # Errors
390 ///
391 /// Returns [`ParseError::EmitFailed`] when the bytes are not a
392 /// valid `grammar.json` document.
393 pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
394 let mut grammar: Self =
395 serde_json::from_slice(bytes).map_err(|e| ParseError::EmitFailed {
396 protocol: protocol.to_owned(),
397 reason: format!("grammar.json deserialization failed: {e}"),
398 })?;
399 grammar.subtypes = compute_subtype_closure(&grammar);
400 Ok(grammar)
401 }
402}
403
404/// Compute the subtyping relation as a forward-indexed map from a
405/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
406fn compute_subtype_closure(
407 grammar: &Grammar,
408) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
409 use std::collections::{HashMap, HashSet};
410 // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
411 // recorded whenever Y is reached by walking the grammar's
412 // ALIAS / hidden-rule / supertype dispatch from a position where
413 // K is the actual vertex kind.
414 let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
415 for name in grammar.rules.keys() {
416 subtypes
417 .entry(name.clone())
418 .or_default()
419 .insert(name.clone());
420 }
421
422 // First pass: collect the immediate "satisfies" edges from each
423 // expandable rule (hidden, supertype) to the kinds reachable by
424 // walking its body, plus alias edges.
425 fn walk<'g>(
426 grammar: &'g Grammar,
427 production: &'g Production,
428 visited: &mut HashSet<&'g str>,
429 out: &mut HashSet<String>,
430 ) {
431 match production {
432 Production::Symbol { name } => {
433 // Direct subtype.
434 out.insert(name.clone());
435 // Continue expansion through hidden / supertype rules
436 // so the closure traverses pass-through dispatch.
437 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
438 if expand && visited.insert(name.as_str()) {
439 if let Some(rule) = grammar.rules.get(name) {
440 walk(grammar, rule, visited, out);
441 }
442 }
443 }
444 Production::Choice { members } | Production::Seq { members } => {
445 for m in members {
446 walk(grammar, m, visited, out);
447 }
448 }
449 Production::Alias {
450 content,
451 named,
452 value,
453 } => {
454 if *named && !value.is_empty() {
455 out.insert(value.clone());
456 }
457 walk(grammar, content, visited, out);
458 }
459 Production::Repeat { content }
460 | Production::Repeat1 { content }
461 | Production::Optional { content }
462 | Production::Field { content, .. }
463 | Production::Token { content }
464 | Production::ImmediateToken { content }
465 | Production::Prec { content, .. }
466 | Production::PrecLeft { content, .. }
467 | Production::PrecRight { content, .. }
468 | Production::PrecDynamic { content, .. }
469 | Production::Reserved { content, .. } => {
470 walk(grammar, content, visited, out);
471 }
472 _ => {}
473 }
474 }
475
476 for (name, rule) in &grammar.rules {
477 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
478 if !expand {
479 continue;
480 }
481 let mut visited: HashSet<&str> = HashSet::new();
482 visited.insert(name.as_str());
483 let mut reachable: HashSet<String> = HashSet::new();
484 walk(grammar, rule, &mut visited, &mut reachable);
485 for kind in &reachable {
486 subtypes
487 .entry(kind.clone())
488 .or_default()
489 .insert(name.clone());
490 }
491 }
492
493 // Aliases: scan every rule body for ALIAS { content, value }
494 // declarations. The kinds reachable from `content` satisfy
495 // `value`, AND (by construction) `value` satisfies the
496 // surrounding rule. Walking the ENTIRE grammar once captures
497 // every alias site, irrespective of which rule introduces it.
498 fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
499 match production {
500 Production::Alias {
501 content,
502 named,
503 value,
504 } => {
505 if *named && !value.is_empty() {
506 out.push((value.clone(), content.as_ref()));
507 }
508 collect_aliases(content, out);
509 }
510 Production::Choice { members } | Production::Seq { members } => {
511 for m in members {
512 collect_aliases(m, out);
513 }
514 }
515 Production::Repeat { content }
516 | Production::Repeat1 { content }
517 | Production::Optional { content }
518 | Production::Field { content, .. }
519 | Production::Token { content }
520 | Production::ImmediateToken { content }
521 | Production::Prec { content, .. }
522 | Production::PrecLeft { content, .. }
523 | Production::PrecRight { content, .. }
524 | Production::PrecDynamic { content, .. }
525 | Production::Reserved { content, .. } => {
526 collect_aliases(content, out);
527 }
528 _ => {}
529 }
530 }
531 let mut aliases: Vec<(String, &Production)> = Vec::new();
532 for rule in grammar.rules.values() {
533 collect_aliases(rule, &mut aliases);
534 }
535 for (alias_value, content) in aliases {
536 let mut visited: HashSet<&str> = HashSet::new();
537 let mut reachable: HashSet<String> = HashSet::new();
538 walk(grammar, content, &mut visited, &mut reachable);
539 // Aliased value satisfies itself and is satisfied by every
540 // kind its content can reach.
541 subtypes
542 .entry(alias_value.clone())
543 .or_default()
544 .insert(alias_value.clone());
545 for kind in reachable {
546 subtypes
547 .entry(kind)
548 .or_default()
549 .insert(alias_value.clone());
550 }
551 }
552
553 // Transitive close: `K ⊑ A` and `A ⊑ B` implies `K ⊑ B`. Iterate
554 // a few rounds; the relation is small so a quick fixed-point
555 // suffices in practice.
556 for _ in 0..8 {
557 let snapshot = subtypes.clone();
558 let mut changed = false;
559 for (kind, supers) in &snapshot {
560 let extra: HashSet<String> = supers
561 .iter()
562 .flat_map(|s| snapshot.get(s).cloned().unwrap_or_default())
563 .collect();
564 let entry = subtypes.entry(kind.clone()).or_default();
565 for s in extra {
566 if entry.insert(s) {
567 changed = true;
568 }
569 }
570 }
571 if !changed {
572 break;
573 }
574 }
575
576 subtypes
577}
578
579// ═══════════════════════════════════════════════════════════════════
580// Format policy
581// ═══════════════════════════════════════════════════════════════════
582
583/// Whitespace and indentation policy applied during emission.
584///
585/// The default policy inserts a single space between adjacent tokens,
586/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
587/// boundaries. Per-language overrides (idiomatic indent width,
588/// trailing-comma rules, blank-line conventions) can ride alongside
589/// this struct in a follow-up branch; today's defaults aim only for
590/// syntactic validity.
591#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
592pub struct FormatPolicy {
593 /// Number of spaces per indent level.
594 pub indent_width: usize,
595 /// Separator inserted between adjacent terminals that the lexer
596 /// would otherwise glue together (word ↔ word, operator ↔ operator).
597 /// Default is a single space.
598 pub separator: String,
599 /// Newline byte sequence emitted after `line_break_after` tokens
600 /// and at end-of-output. Default is `"\n"`.
601 pub newline: String,
602 /// Tokens after which the walker breaks to a new line.
603 pub line_break_after: Vec<String>,
604 /// Tokens that increase indent on emission.
605 pub indent_open: Vec<String>,
606 /// Tokens that decrease indent on emission.
607 pub indent_close: Vec<String>,
608}
609
610impl Default for FormatPolicy {
611 fn default() -> Self {
612 Self {
613 indent_width: 2,
614 separator: " ".to_owned(),
615 newline: "\n".to_owned(),
616 line_break_after: vec![";".into(), "{".into(), "}".into()],
617 indent_open: vec!["{".into()],
618 indent_close: vec!["}".into()],
619 }
620 }
621}
622
623// ═══════════════════════════════════════════════════════════════════
624// Emitter
625// ═══════════════════════════════════════════════════════════════════
626
627/// Emit a by-construction schema to source bytes.
628///
629/// `protocol` is the grammar / language name (used in error messages
630/// and to label the entry point).
631///
632/// The walker treats `schema.entries` as the ordered list of root
633/// vertices, falling back to a deterministic by-id ordering when
634/// `entries` is empty. Each root is emitted using the production
635/// associated with its kind in `grammar.rules`.
636///
637/// # Errors
638///
639/// Returns [`ParseError::EmitFailed`] when:
640///
641/// - the schema has no vertices
642/// - a root vertex's kind is not a grammar rule
643/// - a `SYMBOL` reference points at a kind with no rule and no schema
644/// child to resolve it to
645/// - a required `FIELD` has no corresponding edge in the schema
646pub fn emit_pretty(
647 protocol: &str,
648 schema: &Schema,
649 grammar: &Grammar,
650 policy: &FormatPolicy,
651) -> Result<Vec<u8>, ParseError> {
652 let roots = collect_roots(schema);
653 if roots.is_empty() {
654 return Err(ParseError::EmitFailed {
655 protocol: protocol.to_owned(),
656 reason: "schema has no entry vertices".to_owned(),
657 });
658 }
659
660 let mut out = Output::new(policy);
661 for (i, root) in roots.iter().enumerate() {
662 if i > 0 {
663 out.newline();
664 }
665 emit_vertex(protocol, schema, grammar, root, &mut out)?;
666 }
667 Ok(out.finish())
668}
669
670fn collect_roots(schema: &Schema) -> Vec<&panproto_gat::Name> {
671 if !schema.entries.is_empty() {
672 return schema
673 .entries
674 .iter()
675 .filter(|name| schema.vertices.contains_key(*name))
676 .collect();
677 }
678
679 // Fallback: every vertex that is not the target of any structural edge
680 // (sorted by id for determinism).
681 let mut targets: std::collections::HashSet<&panproto_gat::Name> =
682 std::collections::HashSet::new();
683 for edge in schema.edges.keys() {
684 targets.insert(&edge.tgt);
685 }
686 let mut roots: Vec<&panproto_gat::Name> = schema
687 .vertices
688 .keys()
689 .filter(|name| !targets.contains(name))
690 .collect();
691 roots.sort();
692 roots
693}
694
695fn emit_vertex(
696 protocol: &str,
697 schema: &Schema,
698 grammar: &Grammar,
699 vertex_id: &panproto_gat::Name,
700 out: &mut Output<'_>,
701) -> Result<(), ParseError> {
702 let vertex = schema
703 .vertices
704 .get(vertex_id)
705 .ok_or_else(|| ParseError::EmitFailed {
706 protocol: protocol.to_owned(),
707 reason: format!("vertex '{vertex_id}' not found"),
708 })?;
709
710 // Leaf shortcut: a vertex carrying a `literal-value` constraint
711 // and no outgoing structural edges is a terminal token. Emit the
712 // captured value directly. This handles identifiers, numeric
713 // literals, and string literals that the parser stored as
714 // `literal-value` even on by-construction schemas.
715 if let Some(literal) = literal_value(schema, vertex_id) {
716 if children_for(schema, vertex_id).is_empty() {
717 out.token(literal);
718 return Ok(());
719 }
720 }
721
722 let kind = vertex.kind.as_ref();
723 let edges = children_for(schema, vertex_id);
724 if let Some(rule) = grammar.rules.get(kind) {
725 let mut cursor = ChildCursor::new(&edges);
726 emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out)?;
727 // Drain any extras left after the rule walk completed; tree-sitter
728 // may record trailing comments as children of the surrounding
729 // vertex (i.e. after the last structural child the rule matched).
730 drain_extras(protocol, schema, grammar, &mut cursor, out)?;
731 return Ok(());
732 }
733
734 // No rule for this kind. The parser produced it via an ALIAS
735 // (tree-sitter's `alias($.something, $.actual_kind)`) or via an
736 // external scanner (e.g. YAML's `document` root). Fall back to
737 // walking the children directly so the inner content survives;
738 // surrounding tokens — whose only source is the missing rule —
739 // are necessarily absent.
740 for edge in &edges {
741 emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
742 }
743 Ok(())
744}
745
746/// Linear cursor over a vertex's outgoing edges, used to thread
747/// children through a production rule without double-consuming them.
748struct ChildCursor<'a> {
749 edges: &'a [&'a Edge],
750 consumed: Vec<bool>,
751}
752
753impl<'a> ChildCursor<'a> {
754 fn new(edges: &'a [&'a Edge]) -> Self {
755 Self {
756 edges,
757 consumed: vec![false; edges.len()],
758 }
759 }
760
761 /// Take the next unconsumed edge whose kind equals `field_name`.
762 fn take_field(&mut self, field_name: &str) -> Option<&'a Edge> {
763 for (i, edge) in self.edges.iter().enumerate() {
764 if !self.consumed[i] && edge.kind.as_ref() == field_name {
765 self.consumed[i] = true;
766 return Some(edge);
767 }
768 }
769 None
770 }
771
772 /// Whether any unconsumed edge satisfies `predicate`. Used by the
773 /// unit tests; the live emit path went through `has_matching` on
774 /// each alternative until cursor-driven dispatch was rewritten to
775 /// pick the first-unconsumed-edge's kind directly.
776 #[cfg(test)]
777 fn has_matching(&self, predicate: impl Fn(&Edge) -> bool) -> bool {
778 self.edges
779 .iter()
780 .enumerate()
781 .any(|(i, edge)| !self.consumed[i] && predicate(edge))
782 }
783
784 /// Take the next unconsumed edge whose target vertex satisfies
785 /// `predicate`. Returns the edge and the underlying production
786 /// resolution path is the caller's job.
787 fn take_matching(&mut self, predicate: impl Fn(&Edge) -> bool) -> Option<&'a Edge> {
788 for (i, edge) in self.edges.iter().enumerate() {
789 if !self.consumed[i] && predicate(edge) {
790 self.consumed[i] = true;
791 return Some(edge);
792 }
793 }
794 None
795 }
796}
797
798thread_local! {
799 static EMIT_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
800 /// Set of `(vertex_id, rule_name)` pairs that are currently being
801 /// walked by the recursion. A SYMBOL that resolves to a rule
802 /// already on this stack closes a μ-binder cycle: in the
803 /// coinductive reading, the rule walk at any vertex is the least
804 /// fixed point of `body[μ X . body / X]`, which unfolds at most
805 /// once, with the second visit returning the empty sequence (the
806 /// unit of the free token monoid). Examples that trigger this:
807 /// YAML's `stream` ⊃ `_b_blk_*` mutually-recursive chain, Rust's
808 /// `_expression` ⊃ `binary_expression` ⊃ `_expression`.
809 static EMIT_MU_FRAMES: std::cell::RefCell<std::collections::HashSet<(String, String)>> =
810 std::cell::RefCell::new(std::collections::HashSet::new());
811 /// The name of the FIELD whose body the walker is currently inside,
812 /// or `None` at top level. Lets a SYMBOL nested arbitrarily deep
813 /// in the field's content (under SEQ, CHOICE, REPEAT, OPTIONAL)
814 /// consume from the *outer* cursor by edge-kind rather than from
815 /// the child's own cursor by symbol-match. Without this, shapes
816 /// like `field('args', commaSep1($.X))` — which expands to
817 /// `FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X))))` — emit only
818 /// the first matched edge: the FIELD handler consumed one edge,
819 /// the inner REPEAT searched the consumed child's cursor (which
820 /// has no more sibling field edges), and the REPEAT broke after
821 /// one iteration. Setting the context here so the inner SYMBOL
822 /// pulls successive field-named edges from the outer cursor
823 /// recovers every matched edge across arbitrary nesting.
824 static EMIT_FIELD_CONTEXT: std::cell::RefCell<Option<String>> =
825 const { std::cell::RefCell::new(None) };
826}
827
828/// RAII guard that restores the prior `EMIT_FIELD_CONTEXT` value on drop.
829struct FieldContextGuard(Option<String>);
830
831impl Drop for FieldContextGuard {
832 fn drop(&mut self) {
833 EMIT_FIELD_CONTEXT.with(|f| *f.borrow_mut() = self.0.take());
834 }
835}
836
837fn push_field_context(name: &str) -> FieldContextGuard {
838 let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().replace(name.to_owned()));
839 FieldContextGuard(prev)
840}
841
842/// Clear the field context for the duration of a child-context walk.
843/// The child's own production has its own FIELDs that set their own
844/// context; the outer field hint must not leak into them.
845fn clear_field_context() -> FieldContextGuard {
846 let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().take());
847 FieldContextGuard(prev)
848}
849
850fn current_field_context() -> Option<String> {
851 EMIT_FIELD_CONTEXT.with(|f| f.borrow().clone())
852}
853
854/// Walk a rule at a vertex inside a μ-binder. The wrapping frame is
855/// pushed before recursion and popped after, so any SYMBOL inside
856/// `rule` that re-enters the same `(vertex_id, rule_name)` pair
857/// returns the empty sequence (μ X . body unfolds once).
858fn walk_in_mu_frame(
859 protocol: &str,
860 schema: &Schema,
861 grammar: &Grammar,
862 vertex_id: &panproto_gat::Name,
863 rule_name: &str,
864 rule: &Production,
865 cursor: &mut ChildCursor<'_>,
866 out: &mut Output<'_>,
867) -> Result<(), ParseError> {
868 let key = (vertex_id.to_string(), rule_name.to_owned());
869 let inserted = EMIT_MU_FRAMES.with(|frames| frames.borrow_mut().insert(key.clone()));
870 if !inserted {
871 // We are already walking this rule at this vertex deeper in
872 // the call stack. The coinductive μ-fixed-point reading
873 // returns the empty sequence here; the surrounding
874 // production resumes after the SYMBOL.
875 return Ok(());
876 }
877 let result = emit_production(protocol, schema, grammar, vertex_id, rule, cursor, out);
878 EMIT_MU_FRAMES.with(|frames| {
879 frames.borrow_mut().remove(&key);
880 });
881 result
882}
883
884fn emit_production(
885 protocol: &str,
886 schema: &Schema,
887 grammar: &Grammar,
888 vertex_id: &panproto_gat::Name,
889 production: &Production,
890 cursor: &mut ChildCursor<'_>,
891 out: &mut Output<'_>,
892) -> Result<(), ParseError> {
893 let depth = EMIT_DEPTH.with(|d| {
894 let v = d.get() + 1;
895 d.set(v);
896 v
897 });
898 if depth > 500 {
899 EMIT_DEPTH.with(|d| d.set(d.get() - 1));
900 return Err(ParseError::EmitFailed {
901 protocol: protocol.to_owned(),
902 reason: format!(
903 "emit_production recursion >500 (likely a cyclic grammar; \
904 vertex='{vertex_id}')"
905 ),
906 });
907 }
908 drain_extras(protocol, schema, grammar, cursor, out)?;
909 let result = emit_production_inner(
910 protocol, schema, grammar, vertex_id, production, cursor, out,
911 );
912 EMIT_DEPTH.with(|d| d.set(d.get() - 1));
913 result
914}
915
916/// Consume and emit every leading edge on `cursor` whose target kind
917/// is in `grammar.extras` (typically `line_comment` / `block_comment`).
918/// Extras live outside the production grammar — tree-sitter skips them
919/// at parse time and records them as children of the surrounding
920/// vertex — so the rule walker cannot reconcile them against the
921/// cursor. Draining them as a side channel preserves their content in
922/// the output without confusing the structural matchers.
923fn drain_extras(
924 protocol: &str,
925 schema: &Schema,
926 grammar: &Grammar,
927 cursor: &mut ChildCursor<'_>,
928 out: &mut Output<'_>,
929) -> Result<(), ParseError> {
930 if grammar.extras.is_empty() {
931 return Ok(());
932 }
933 loop {
934 let next_extra: Option<usize> = cursor
935 .edges
936 .iter()
937 .enumerate()
938 .find(|(i, _)| !cursor.consumed[*i])
939 .and_then(|(i, edge)| {
940 let kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref())?;
941 if grammar.extras.contains(kind) {
942 Some(i)
943 } else {
944 None
945 }
946 });
947 let Some(idx) = next_extra else {
948 return Ok(());
949 };
950 cursor.consumed[idx] = true;
951 let target = &cursor.edges[idx].tgt;
952 emit_vertex(protocol, schema, grammar, target, out)?;
953 }
954}
955
956fn emit_production_inner(
957 protocol: &str,
958 schema: &Schema,
959 grammar: &Grammar,
960 vertex_id: &panproto_gat::Name,
961 production: &Production,
962 cursor: &mut ChildCursor<'_>,
963 out: &mut Output<'_>,
964) -> Result<(), ParseError> {
965 match production {
966 Production::String { value } => {
967 out.token(value);
968 Ok(())
969 }
970 Production::Pattern { value } => {
971 if let Some(literal) = literal_value(schema, vertex_id) {
972 out.token(literal);
973 } else if is_newline_like_pattern(value) {
974 // Patterns like `\r?\n`, `\n`, `\r\n` are the structural
975 // newline tokens grammars use to separate top-level
976 // statements (csound's `_new_line`, ABC's line-end, etc.).
977 // Emitting them through the placeholder fallback rendered
978 // the bare `_` sentinel between siblings; route them to
979 // the layout pass's line-break instead so the output
980 // re-parses.
981 out.newline();
982 } else if is_whitespace_only_pattern(value) {
983 // `\s+`, `[ \t]+` and friends are interstitial whitespace
984 // tokens. Emit nothing: the layout pass inserts the
985 // policy separator between adjacent Lits if needed.
986 } else {
987 out.token(&placeholder_for_pattern(value));
988 }
989 Ok(())
990 }
991 Production::Blank => Ok(()),
992 Production::Symbol { name } => {
993 // Inside a FIELD body, a SYMBOL consumes by field-name on
994 // the outer cursor rather than searching by symbol-match.
995 // This covers the simple `FIELD(SYMBOL X)` case as well as
996 // every nesting under FIELD that contains SYMBOLs (SEQ,
997 // CHOICE, REPEAT, OPTIONAL, ALIAS). Without the override,
998 // shapes like `field('args', commaSep1($.X))` consume one
999 // field edge in the FIELD handler and then the REPEAT
1000 // inside SEQ searches the consumed child's cursor — where
1001 // no sibling field edges sit — and breaks after one
1002 // iteration.
1003 if let Some(field) = current_field_context() {
1004 if let Some(edge) = cursor.take_field(&field) {
1005 return emit_in_child_context(
1006 protocol, schema, grammar, &edge.tgt, production, out,
1007 );
1008 }
1009 // No matching field-named edge left on the outer
1010 // cursor. Surface nothing; the surrounding REPEAT /
1011 // OPTIONAL / CHOICE backtracks the literal tokens it
1012 // emitted on this iteration when it sees no progress.
1013 return Ok(());
1014 }
1015 if name.starts_with('_') {
1016 // Hidden rule: not a vertex kind on the schema side.
1017 // Inline-expand the rule body so its children take
1018 // edges from the current cursor, instead of trying to
1019 // take a single child edge that "satisfies" the
1020 // hidden rule and discarding the rest of the body
1021 // (which would drop tokens like `=` and the trailing
1022 // value SYMBOL inside e.g. TOML's `_inline_pair`).
1023 //
1024 // Wrapped in a μ-frame so a hidden rule that
1025 // references its own kind cyclically (or another
1026 // hidden rule that closes the cycle) unfolds once
1027 // and then collapses to the empty sequence at the
1028 // second visit, rather than blowing the stack.
1029 if let Some(rule) = grammar.rules.get(name) {
1030 walk_in_mu_frame(
1031 protocol, schema, grammar, vertex_id, name, rule, cursor, out,
1032 )
1033 } else {
1034 // External hidden rule (declared in the
1035 // grammar's `externals` block, scanned by C code,
1036 // not listed in `rules`). Heuristic fallback by
1037 // name:
1038 //
1039 // - `_indent` / `*_indent`: open an indent block.
1040 // Indent-based grammars (Python, YAML, qvr)
1041 // declare an `_indent` external scanner before
1042 // the body of a block-bodied declaration; the
1043 // emitted output is unparseable without the
1044 // corresponding indentation jump.
1045 // - `_dedent` / `*_dedent`: close the matching
1046 // indent block.
1047 // - `_newline` / `*_line_ending` / `*_or_eof`:
1048 // universally newline-or-empty; emitting a
1049 // single newline is the right default for
1050 // grammars like TOML whose `pair` SEQ trails
1051 // into `_line_ending_or_eof`.
1052 //
1053 // Anything else falls through silently — better
1054 // to drop an unknown external token than to
1055 // invent one that confuses re-parsing.
1056 if name == "_indent" || name.ends_with("_indent") {
1057 out.indent_open();
1058 } else if name == "_dedent" || name.ends_with("_dedent") {
1059 out.indent_close();
1060 } else if name.contains("line_ending")
1061 || name.contains("newline")
1062 || name.ends_with("_or_eof")
1063 {
1064 out.newline();
1065 }
1066 Ok(())
1067 }
1068 } else if let Some(edge) = take_symbol_match(grammar, schema, cursor, name) {
1069 // For supertype / hidden-rule dispatch the child's
1070 // own kind names the actual production to walk
1071 // (`child.kind` IS the subtype). For ALIAS the
1072 // dependent-optic context is carried by the
1073 // surrounding `Production::Alias` branch, which calls
1074 // `emit_aliased_child` directly; we don't reach here
1075 // for that case. So walking `grammar.rules[child.kind]`
1076 // via `emit_vertex` is correct: the dependent-optic
1077 // path is preserved at every site where it actually
1078 // diverges from `child.kind`.
1079 emit_vertex(protocol, schema, grammar, &edge.tgt, out)
1080 } else if vertex_id_kind(schema, vertex_id) == Some(name.as_str()) {
1081 let rule = grammar
1082 .rules
1083 .get(name)
1084 .ok_or_else(|| ParseError::EmitFailed {
1085 protocol: protocol.to_owned(),
1086 reason: format!("no production for SYMBOL '{name}'"),
1087 })?;
1088 // Self-reference (`X = ... SYMBOL X ...`): wrap in a
1089 // μ-frame so re-entry collapses to the empty sequence.
1090 walk_in_mu_frame(
1091 protocol, schema, grammar, vertex_id, name, rule, cursor, out,
1092 )
1093 } else {
1094 // Named rule with no matching child: emit nothing and
1095 // let the surrounding CHOICE / OPTIONAL / REPEAT
1096 // resolve the absence.
1097 Ok(())
1098 }
1099 }
1100 Production::Seq { members } => {
1101 for member in members {
1102 emit_production(protocol, schema, grammar, vertex_id, member, cursor, out)?;
1103 }
1104 Ok(())
1105 }
1106 Production::Choice { members } => {
1107 if let Some(matched) =
1108 pick_choice_with_cursor(schema, grammar, vertex_id, cursor, members)
1109 {
1110 emit_production(protocol, schema, grammar, vertex_id, matched, cursor, out)
1111 } else {
1112 Ok(())
1113 }
1114 }
1115 Production::Repeat { content } | Production::Repeat1 { content } => {
1116 // Detect a "separator-leading SEQ" iteration body: SEQ whose
1117 // first member is a CHOICE containing BLANK (or an OPTIONAL),
1118 // i.e. the source-level separator between two iterations is
1119 // syntactically optional. When the chosen alternative for
1120 // that separator slot emits zero content tokens at runtime,
1121 // there was no source-level separator between this iteration
1122 // and the previous one; the layout pass must suppress its
1123 // policy separator to match the source's tight adjacency.
1124 //
1125 // Categorical reading: REPEAT body `B = SEQ(SEP, BODY)` is
1126 // the pullback of two halves. The bytes emitted in iteration
1127 // k+1 are a concatenation of `SEP_k+1` and `BODY_k+1`; if
1128 // `SEP_k+1` is the empty word, the concatenation of
1129 // `BODY_k` and `BODY_k+1` must remain a single contiguous
1130 // span. Hence the NoSpace marker.
1131 let separator_leading_seq: Option<&[Production]> = match content.as_ref() {
1132 Production::Seq { members } if members.len() >= 2 => {
1133 let first = &members[0];
1134 let is_separator_slot = match first {
1135 Production::Choice { members } => {
1136 members.iter().any(|m| matches!(m, Production::Blank))
1137 }
1138 Production::Optional { .. } => true,
1139 _ => false,
1140 };
1141 if is_separator_slot {
1142 Some(members.as_slice())
1143 } else {
1144 None
1145 }
1146 }
1147 _ => None,
1148 };
1149
1150 let mut emitted_any = false;
1151 loop {
1152 let cursor_snap = cursor.consumed.clone();
1153 let out_snap = out.snapshot();
1154 let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1155 let result: Result<(), ParseError> =
1156 if let Some(seq_members) = separator_leading_seq {
1157 // Emit the separator slot first and observe
1158 // whether it contributed any Lit. If not, push
1159 // a NoSpace marker before walking the remaining
1160 // SEQ members. The OutputSnapshot here covers
1161 // only the separator's emission window.
1162 let pre_sep = out.snapshot();
1163 let sep_result = emit_production(
1164 protocol,
1165 schema,
1166 grammar,
1167 vertex_id,
1168 &seq_members[0],
1169 cursor,
1170 out,
1171 );
1172 match sep_result {
1173 Err(e) => Err(e),
1174 Ok(()) => {
1175 if !out.lit_emitted_since(pre_sep) {
1176 out.no_space();
1177 }
1178 let mut rest_result = Ok(());
1179 for member in &seq_members[1..] {
1180 rest_result = emit_production(
1181 protocol, schema, grammar, vertex_id, member, cursor, out,
1182 );
1183 if rest_result.is_err() {
1184 break;
1185 }
1186 }
1187 rest_result
1188 }
1189 }
1190 } else {
1191 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1192 };
1193 let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1194 if result.is_err() || consumed_after == consumed_before {
1195 cursor.consumed = cursor_snap;
1196 out.restore(out_snap);
1197 break;
1198 }
1199 emitted_any = true;
1200 }
1201 if matches!(production, Production::Repeat1 { .. }) && !emitted_any {
1202 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)?;
1203 }
1204 Ok(())
1205 }
1206 Production::Optional { content } => {
1207 let cursor_snap = cursor.consumed.clone();
1208 let out_snap = out.snapshot();
1209 let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1210 let result =
1211 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
1212 // OPTIONAL is a backtracking site: if the inner production
1213 // errored *or* made no progress without leaving a witness
1214 // constraint, restore both cursor and output to their
1215 // pre-attempt state. Mirrors `Repeat`'s loop body.
1216 if result.is_err() {
1217 cursor.consumed = cursor_snap;
1218 out.restore(out_snap);
1219 return result;
1220 }
1221 let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1222 if consumed_after == consumed_before
1223 && !has_relevant_constraint(content, schema, vertex_id)
1224 {
1225 cursor.consumed = cursor_snap;
1226 out.restore(out_snap);
1227 }
1228 Ok(())
1229 }
1230 Production::Field { name, content } => {
1231 // Set the field context for the duration of `content`'s
1232 // walk and emit the content against the *outer* cursor.
1233 // The SYMBOL handler picks up the context and pulls
1234 // successive `take_field(name)` edges as it encounters
1235 // SYMBOLs anywhere under `content` (under SEQ, CHOICE,
1236 // REPEAT, OPTIONAL, ALIAS — arbitrarily nested). This
1237 // subsumes the prior carve-outs for FIELD(REPEAT(...)),
1238 // FIELD(REPEAT1(...)), and the bare FIELD(SYMBOL ...)
1239 // case, and adds coverage for
1240 // `field('xs', commaSep1($.X))` which expands to
1241 // FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X)))) and
1242 // any other shape where REPEAT/REPEAT1 sits inside SEQ /
1243 // CHOICE / OPTIONAL under a FIELD. A FIELD that wraps a
1244 // non-SYMBOL production (e.g. `field('op', '+')` or
1245 // `field('op', CHOICE(STRING ...))`) still works: STRING
1246 // handlers ignore the context and emit literals
1247 // directly, so the operator token survives the round
1248 // trip.
1249 let _guard = push_field_context(name);
1250 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1251 }
1252 Production::Alias {
1253 content,
1254 named,
1255 value,
1256 } => {
1257 // A named ALIAS rewrites the parser-visible kind to
1258 // `value`. If the cursor has an unconsumed child whose
1259 // kind matches that alias name, take it and emit the
1260 // child using the alias's INNER content as the rule
1261 // (e.g. `ALIAS { SYMBOL real_rule, value: "kind_x" }`
1262 // means a `kind_x` vertex on the schema should be walked
1263 // through `real_rule`'s body, not through whatever rule
1264 // happens to be keyed under `kind_x`). This is the
1265 // dependent-optic shape: the rule the emitter walks at a
1266 // child position is determined by the parent's chosen
1267 // alias, not by the child kind alone — without it,
1268 // grammars like YAML that introduce the same kind through
1269 // many ALIAS sites lose the parent context the moment
1270 // emit_vertex is called.
1271 if *named && !value.is_empty() {
1272 if let Some(edge) = cursor.take_matching(|edge| {
1273 schema
1274 .vertices
1275 .get(&edge.tgt)
1276 .map(|v| v.kind.as_ref() == value.as_str())
1277 .unwrap_or(false)
1278 }) {
1279 return emit_aliased_child(protocol, schema, grammar, &edge.tgt, content, out);
1280 }
1281 }
1282 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1283 }
1284 Production::Token { content }
1285 | Production::ImmediateToken { content }
1286 | Production::Prec { content, .. }
1287 | Production::PrecLeft { content, .. }
1288 | Production::PrecRight { content, .. }
1289 | Production::PrecDynamic { content, .. }
1290 | Production::Reserved { content, .. } => {
1291 emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1292 }
1293 }
1294}
1295
1296/// Take the next cursor edge whose target vertex's kind matches the
1297/// SYMBOL `name` directly or via inline expansion of a hidden rule.
1298fn take_symbol_match<'a>(
1299 grammar: &Grammar,
1300 schema: &Schema,
1301 cursor: &mut ChildCursor<'a>,
1302 name: &str,
1303) -> Option<&'a Edge> {
1304 cursor.take_matching(|edge| {
1305 let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1306 kind_satisfies_symbol(grammar, target_kind, name)
1307 })
1308}
1309
1310/// Decide whether a schema vertex of kind `target_kind` satisfies a
1311/// SYMBOL `name` reference in the grammar.
1312///
1313/// Operates as an O(1) lookup against the precomputed subtype
1314/// closure built at [`Grammar::from_bytes`]. The semantic content is
1315/// "K satisfies SYMBOL S iff K is reachable from S by walking the
1316/// grammar's hidden, supertype, and named-alias dispatch": this is
1317/// exactly the relation tree-sitter induces on `(parser-visible kind,
1318/// rule-position)` pairs.
1319fn kind_satisfies_symbol(grammar: &Grammar, target_kind: Option<&str>, name: &str) -> bool {
1320 let Some(target) = target_kind else {
1321 return false;
1322 };
1323 if target == name {
1324 return true;
1325 }
1326 grammar
1327 .subtypes
1328 .get(target)
1329 .is_some_and(|set| set.contains(name))
1330}
1331
1332/// Emit a child reached through an ALIAS production using the
1333/// alias's inner content as the rule, not `grammar.rules[child.kind]`.
1334///
1335/// This carries the dependent-optic context across the ALIAS edge:
1336/// at the parent rule's site we know which underlying production the
1337/// alias wraps (typically `SYMBOL real_rule`), and that's the
1338/// production that should drive the emit walk on the child's
1339/// children. Looking up `grammar.rules.get(child.kind)` instead would
1340/// either fail (the renamed kind has no top-level rule, e.g. YAML's
1341/// `block_mapping_pair`) or pick an arbitrary same-kinded rule from
1342/// elsewhere in the grammar.
1343///
1344/// Walk-context invariant. The dependent-optic shape of `emit_pretty`
1345/// says: the production walked at any vertex is determined by the
1346/// path from the root through the grammar, not by the vertex kind in
1347/// isolation. Two dispatch sites realise that invariant:
1348///
1349/// * [`emit_vertex`] looks up `grammar.rules[child.kind]` and walks
1350/// it. Correct for supertype / hidden-rule dispatch: the child's
1351/// kind on the schema IS the subtype tree-sitter selected, so its
1352/// top-level rule is the right production to walk.
1353/// * `emit_aliased_child` threads the parent rule's `Production`
1354/// directly (the inner `content` of `Production::Alias`) and walks
1355/// it on the child's children. Correct for ALIAS dispatch: the
1356/// child's kind on the schema is the alias's `value` (a renamed
1357/// kind that may have no top-level rule), and the production to
1358/// walk is the alias's content body, supplied by the parent.
1359///
1360/// Together these cover every site where the rule-walked-at-child
1361/// diverges from `grammar.rules[child.kind]`; the recursion site for
1362/// plain SYMBOL therefore correctly delegates to `emit_vertex`, and
1363/// we do not need a richer `WalkContext` value passed by reference.
1364/// The grammar dependency is the thread.
1365fn emit_aliased_child(
1366 protocol: &str,
1367 schema: &Schema,
1368 grammar: &Grammar,
1369 child_id: &panproto_gat::Name,
1370 content: &Production,
1371 out: &mut Output<'_>,
1372) -> Result<(), ParseError> {
1373 // Leaf shortcut: if the child has a literal-value and no
1374 // structural children, emit the captured text. Identifiers and
1375 // similar terminals reach here when an ALIAS wraps a SYMBOL that
1376 // resolves to a PATTERN.
1377 if let Some(literal) = literal_value(schema, child_id) {
1378 if children_for(schema, child_id).is_empty() {
1379 out.token(literal);
1380 return Ok(());
1381 }
1382 }
1383
1384 // Resolve `content` to a rule when it's a SYMBOL (the dominant
1385 // shape: `ALIAS { content: SYMBOL real_rule, value: "kind_x" }`).
1386 if let Production::Symbol { name } = content {
1387 if let Some(rule) = grammar.rules.get(name) {
1388 let edges = children_for(schema, child_id);
1389 let mut cursor = ChildCursor::new(&edges);
1390 return emit_production(protocol, schema, grammar, child_id, rule, &mut cursor, out);
1391 }
1392 }
1393
1394 // Other ALIAS contents (CHOICE, SEQ, literals) walk in place.
1395 let edges = children_for(schema, child_id);
1396 let mut cursor = ChildCursor::new(&edges);
1397 emit_production(
1398 protocol,
1399 schema,
1400 grammar,
1401 child_id,
1402 content,
1403 &mut cursor,
1404 out,
1405 )
1406}
1407
1408fn emit_in_child_context(
1409 protocol: &str,
1410 schema: &Schema,
1411 grammar: &Grammar,
1412 child_id: &panproto_gat::Name,
1413 production: &Production,
1414 out: &mut Output<'_>,
1415) -> Result<(), ParseError> {
1416 // The child walks under its own production tree, with its own
1417 // FIELDs setting their own contexts. Clear the outer FIELD hint
1418 // so it does not leak through and cause sibling SYMBOLs inside
1419 // the child's body to mistakenly pull edges from the child's
1420 // cursor by the parent's field name.
1421 let _guard = clear_field_context();
1422 // If `production` is a structural wrapper (CHOICE / SEQ /
1423 // OPTIONAL / ...) whose referenced symbols cover the child's own
1424 // kind, the child IS the production's target node and the right
1425 // emit path is `emit_vertex(child)` (which honours the
1426 // literal-value leaf shortcut). Without this guard, FIELD(pattern,
1427 // CHOICE { _pattern, self }) on an identifier child walks the
1428 // CHOICE on the identifier's empty cursor, falls through to the
1429 // first non-BLANK alt, and loses the captured identifier text.
1430 if !matches!(production, Production::Symbol { .. }) {
1431 let child_kind = schema.vertices.get(child_id).map(|v| v.kind.as_ref());
1432 let symbols = referenced_symbols(production);
1433 if symbols
1434 .iter()
1435 .any(|s| kind_satisfies_symbol(grammar, child_kind, s) || child_kind == Some(s))
1436 {
1437 return emit_vertex(protocol, schema, grammar, child_id, out);
1438 }
1439 }
1440 match production {
1441 Production::Symbol { .. } => emit_vertex(protocol, schema, grammar, child_id, out),
1442 _ => {
1443 let edges = children_for(schema, child_id);
1444 let mut cursor = ChildCursor::new(&edges);
1445 emit_production(
1446 protocol,
1447 schema,
1448 grammar,
1449 child_id,
1450 production,
1451 &mut cursor,
1452 out,
1453 )
1454 }
1455 }
1456}
1457
1458fn pick_choice_with_cursor<'a>(
1459 schema: &Schema,
1460 grammar: &Grammar,
1461 vertex_id: &panproto_gat::Name,
1462 cursor: &ChildCursor<'_>,
1463 alternatives: &'a [Production],
1464) -> Option<&'a Production> {
1465 // Discriminator-driven dispatch (highest priority): when the
1466 // walker recorded a `chose-alt-fingerprint` constraint at parse
1467 // time, dispatch directly against that. This is the categorical
1468 // discriminator: it survives stripping of byte-position
1469 // constraints (so by-construction round-trips work) and is the
1470 // explicit witness of which CHOICE alternative the parser took.
1471 //
1472 // Falls back to the live `interstitial-*` substring blob when no
1473 // fingerprint is present (e.g. instances built by callers that
1474 // bypass the AstWalker). Both blobs are scored by the longest
1475 // STRING-literal token in an alternative that matches; the
1476 // length tiebreak prefers `&&` over `&`, `==` over `=`, etc.
1477 let constraint_blob = schema
1478 .constraints
1479 .get(vertex_id)
1480 .map(|cs| {
1481 let fingerprint: Option<&str> = cs
1482 .iter()
1483 .find(|c| c.sort.as_ref() == "chose-alt-fingerprint")
1484 .map(|c| c.value.as_str());
1485 if let Some(fp) = fingerprint {
1486 fp.to_owned()
1487 } else {
1488 cs.iter()
1489 .filter(|c| {
1490 let s = c.sort.as_ref();
1491 s.starts_with("interstitial-") && !s.ends_with("-start-byte")
1492 })
1493 .map(|c| c.value.as_str())
1494 .collect::<Vec<&str>>()
1495 .join(" ")
1496 }
1497 })
1498 .unwrap_or_default();
1499 let child_kinds: Vec<&str> = schema
1500 .constraints
1501 .get(vertex_id)
1502 .and_then(|cs| {
1503 cs.iter()
1504 .find(|c| c.sort.as_ref() == "chose-alt-child-kinds")
1505 .map(|c| c.value.split_whitespace().collect())
1506 })
1507 .unwrap_or_default();
1508 if !constraint_blob.is_empty() {
1509 // Primary score: literal-token match length. This dominates
1510 // alt selection so existing language tests that depend on
1511 // literal-only fingerprints keep working.
1512 // Secondary score (tiebreaker only): named-symbol kind match
1513 // count, read from the separate `chose-alt-child-kinds`
1514 // constraint (kept apart from the literal fingerprint so
1515 // identifiers like `:` in the kind list don't contaminate the
1516 // literal match). An alt that matches the recorded kinds is a
1517 // stronger witness than one whose only
1518 // overlap is literal punctuation.
1519 let mut best_literal: usize = 0;
1520 let mut best_symbols: usize = 0;
1521 let mut best_alt: Option<&Production> = None;
1522 let mut tied = false;
1523 for alt in alternatives {
1524 let strings = literal_strings(alt);
1525 if strings.is_empty() {
1526 continue;
1527 }
1528 let literal_score = strings
1529 .iter()
1530 .filter(|s| constraint_blob.contains(s.as_str()))
1531 .map(String::len)
1532 .sum::<usize>();
1533 if literal_score == 0 {
1534 continue;
1535 }
1536 // Symbol score is computed only as a tiebreaker among alts
1537 // whose literal-token coverage is the same; it never lifts
1538 // an alt above one with a strictly higher literal score.
1539 // Reads the `chose-alt-child-kinds` constraint (a separate
1540 // sequence the walker emits, kept apart from the literal
1541 // fingerprint to avoid cross-contamination).
1542 let symbol_score = if literal_score >= best_literal && !child_kinds.is_empty() {
1543 let symbols = referenced_symbols(alt);
1544 symbols
1545 .iter()
1546 .filter(|sym| {
1547 let sym_str: &str = sym;
1548 if child_kinds.contains(&sym_str) {
1549 return true;
1550 }
1551 grammar.subtypes.get(sym_str).is_some_and(|sub_set| {
1552 sub_set
1553 .iter()
1554 .any(|sub| child_kinds.contains(&sub.as_str()))
1555 })
1556 })
1557 .count()
1558 } else {
1559 0
1560 };
1561 let better = literal_score > best_literal
1562 || (literal_score == best_literal && symbol_score > best_symbols);
1563 let same = literal_score == best_literal && symbol_score == best_symbols;
1564 if better {
1565 best_literal = literal_score;
1566 best_symbols = symbol_score;
1567 best_alt = Some(alt);
1568 tied = false;
1569 } else if same && best_alt.is_some() {
1570 tied = true;
1571 }
1572 }
1573 // Only commit to an alt when the fingerprint discriminates it
1574 // uniquely. A tie means the alts share the same literal token
1575 // set (e.g. JSON's `string = CHOICE { SEQ { '"', '"' }, SEQ {
1576 // '"', _string_content, '"' } }` — both alts contain just the
1577 // two `"` tokens). In that case fall through to cursor-based
1578 // dispatch, which uses the actual edge structure.
1579 if let Some(alt) = best_alt {
1580 if !tied {
1581 return Some(alt);
1582 }
1583 }
1584 }
1585
1586 // Cursor-driven dispatch: pick the alternative whose body
1587 // references a SYMBOL covering the *first unconsumed* edge in
1588 // cursor order. `referenced_symbols` walks the alternative
1589 // recursively (across nested SEQs, REPEATs, OPTIONALs, FIELDs,
1590 // etc.) so a leading optional like `attribute_item` does not
1591 // block matching when only the trailing required symbol is
1592 // present on the schema.
1593 //
1594 // Ordering by the first unconsumed edge (rather than picking any
1595 // alternative whose SYMBOL set intersects the unconsumed
1596 // multiset) is what preserves schema edge order under
1597 // REPEAT(CHOICE(...)) productions. Without this rule, alt order
1598 // in the grammar's CHOICE determines the emission order, and a
1599 // schema with interleaved kinds like `[symbol, punct, int,
1600 // symbol, punct, int]` re-fuses to `[symbol, symbol, punct,
1601 // punct, int, int]` when emitted then re-parsed. The fix is the
1602 // categorical reading of REPEAT-over-list (list-shaped fold)
1603 // rather than REPEAT-over-multiset (unordered fold).
1604 let first_unconsumed_kind: Option<&str> = cursor
1605 .edges
1606 .iter()
1607 .enumerate()
1608 .find(|(i, _)| !cursor.consumed[*i])
1609 .and_then(|(_, edge)| schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref()));
1610 if let Some(target_kind) = first_unconsumed_kind {
1611 for alt in alternatives {
1612 let symbols = referenced_symbols(alt);
1613 if !symbols.is_empty()
1614 && symbols
1615 .iter()
1616 .any(|s| kind_satisfies_symbol(grammar, Some(target_kind), s))
1617 {
1618 return Some(alt);
1619 }
1620 }
1621 }
1622
1623 // FIELD dispatch: pick an alternative whose FIELD name matches an
1624 // unconsumed edge kind.
1625 let edge_kinds: Vec<&str> = cursor
1626 .edges
1627 .iter()
1628 .enumerate()
1629 .filter(|(i, _)| !cursor.consumed[*i])
1630 .map(|(_, e)| e.kind.as_ref())
1631 .collect();
1632 for alt in alternatives {
1633 if has_field_in(alt, &edge_kinds) {
1634 return Some(alt);
1635 }
1636 }
1637
1638 // No cursor-driven match. Fall back to:
1639 //
1640 // - BLANK (the explicit empty alternative) when present, so an
1641 // OPTIONAL-shaped CHOICE compiles to nothing.
1642 // - The first non-`BLANK` alternative as a last resort, used by
1643 // STRING-only alternatives (keyword tokens) and other choices
1644 // that don't reach the cursor.
1645 //
1646 // The previous "match own_kind" branch is intentionally absent:
1647 // when an alt's first SYMBOL equals the current vertex's kind, the
1648 // caller is already emitting that vertex's own rule. Recursing
1649 // into the alt would cause a self-loop in the rule walk.
1650 let _ = (schema, vertex_id);
1651 if alternatives.iter().any(|a| matches!(a, Production::Blank)) {
1652 return alternatives.iter().find(|a| matches!(a, Production::Blank));
1653 }
1654 alternatives
1655 .iter()
1656 .find(|alt| !matches!(alt, Production::Blank))
1657}
1658
1659/// Collect every literal STRING token directly inside `production`
1660/// (without descending into SYMBOLs / hidden rules). Used to score
1661/// CHOICE alternatives against the parent vertex's interstitials so
1662/// the right operator / keyword form is picked when the schema
1663/// preserves interstitial fragments from a prior parse.
1664fn literal_strings(production: &Production) -> Vec<String> {
1665 let mut out = Vec::new();
1666 fn walk(p: &Production, out: &mut Vec<String>) {
1667 match p {
1668 Production::String { value } if !value.is_empty() => {
1669 out.push(value.clone());
1670 }
1671 Production::Choice { members } | Production::Seq { members } => {
1672 for m in members {
1673 walk(m, out);
1674 }
1675 }
1676 Production::Repeat { content }
1677 | Production::Repeat1 { content }
1678 | Production::Optional { content }
1679 | Production::Field { content, .. }
1680 | Production::Alias { content, .. }
1681 | Production::Token { content }
1682 | Production::ImmediateToken { content }
1683 | Production::Prec { content, .. }
1684 | Production::PrecLeft { content, .. }
1685 | Production::PrecRight { content, .. }
1686 | Production::PrecDynamic { content, .. }
1687 | Production::Reserved { content, .. } => walk(content, out),
1688 _ => {}
1689 }
1690 }
1691 walk(production, &mut out);
1692 out
1693}
1694
1695/// Collect every SYMBOL name reachable from `production` without
1696/// crossing into nested rules. Used by `pick_choice_with_cursor` to
1697/// rank alternatives by "any SYMBOL inside this alt matches something
1698/// on the cursor", instead of just the first SYMBOL: a leading
1699/// optional like `attribute_item` then `parameter` is otherwise
1700/// rejected when only the parameter children are present.
1701fn referenced_symbols(production: &Production) -> Vec<&str> {
1702 let mut out = Vec::new();
1703 fn walk<'a>(p: &'a Production, out: &mut Vec<&'a str>) {
1704 match p {
1705 Production::Symbol { name } => out.push(name.as_str()),
1706 Production::Choice { members } | Production::Seq { members } => {
1707 for m in members {
1708 walk(m, out);
1709 }
1710 }
1711 Production::Alias {
1712 content,
1713 named,
1714 value,
1715 } => {
1716 // A named ALIAS produces a child vertex whose kind is
1717 // the alias `value` (e.g. `ALIAS { content: STRING "=",
1718 // value: "punctuation", named: true }` introduces a
1719 // `punctuation` child). For cursor-driven dispatch to
1720 // recognise alts that emit such children, yield the
1721 // alias value as a referenced symbol. Anonymous aliases
1722 // do not introduce a named node and only need their
1723 // inner content's symbols.
1724 if *named && !value.is_empty() {
1725 out.push(value.as_str());
1726 }
1727 walk(content, out);
1728 }
1729 Production::Repeat { content }
1730 | Production::Repeat1 { content }
1731 | Production::Optional { content }
1732 | Production::Field { content, .. }
1733 | Production::Token { content }
1734 | Production::ImmediateToken { content }
1735 | Production::Prec { content, .. }
1736 | Production::PrecLeft { content, .. }
1737 | Production::PrecRight { content, .. }
1738 | Production::PrecDynamic { content, .. }
1739 | Production::Reserved { content, .. } => walk(content, out),
1740 _ => {}
1741 }
1742 }
1743 walk(production, &mut out);
1744 out
1745}
1746
1747#[cfg(test)]
1748fn first_symbol(production: &Production) -> Option<&str> {
1749 match production {
1750 Production::Symbol { name } => Some(name),
1751 Production::Seq { members } => members.iter().find_map(first_symbol),
1752 Production::Choice { members } => members.iter().find_map(first_symbol),
1753 Production::Repeat { content }
1754 | Production::Repeat1 { content }
1755 | Production::Optional { content }
1756 | Production::Field { content, .. }
1757 | Production::Alias { content, .. }
1758 | Production::Token { content }
1759 | Production::ImmediateToken { content }
1760 | Production::Prec { content, .. }
1761 | Production::PrecLeft { content, .. }
1762 | Production::PrecRight { content, .. }
1763 | Production::PrecDynamic { content, .. }
1764 | Production::Reserved { content, .. } => first_symbol(content),
1765 _ => None,
1766 }
1767}
1768
1769fn has_field_in(production: &Production, edge_kinds: &[&str]) -> bool {
1770 match production {
1771 Production::Field { name, .. } => edge_kinds.contains(&name.as_str()),
1772 Production::Seq { members } | Production::Choice { members } => {
1773 members.iter().any(|m| has_field_in(m, edge_kinds))
1774 }
1775 Production::Repeat { content }
1776 | Production::Repeat1 { content }
1777 | Production::Optional { content }
1778 | Production::Alias { content, .. }
1779 | Production::Token { content }
1780 | Production::ImmediateToken { content }
1781 | Production::Prec { content, .. }
1782 | Production::PrecLeft { content, .. }
1783 | Production::PrecRight { content, .. }
1784 | Production::PrecDynamic { content, .. }
1785 | Production::Reserved { content, .. } => has_field_in(content, edge_kinds),
1786 _ => false,
1787 }
1788}
1789
1790fn has_relevant_constraint(
1791 production: &Production,
1792 schema: &Schema,
1793 vertex_id: &panproto_gat::Name,
1794) -> bool {
1795 let constraints = match schema.constraints.get(vertex_id) {
1796 Some(c) => c,
1797 None => return false,
1798 };
1799 fn walk(production: &Production, constraints: &[panproto_schema::Constraint]) -> bool {
1800 match production {
1801 Production::String { value } => constraints
1802 .iter()
1803 .any(|c| c.value == *value || c.sort.as_ref() == value),
1804 Production::Field { name, content } => {
1805 constraints.iter().any(|c| c.sort.as_ref() == name) || walk(content, constraints)
1806 }
1807 Production::Seq { members } | Production::Choice { members } => {
1808 members.iter().any(|m| walk(m, constraints))
1809 }
1810 Production::Repeat { content }
1811 | Production::Repeat1 { content }
1812 | Production::Optional { content }
1813 | Production::Alias { content, .. }
1814 | Production::Token { content }
1815 | Production::ImmediateToken { content }
1816 | Production::Prec { content, .. }
1817 | Production::PrecLeft { content, .. }
1818 | Production::PrecRight { content, .. }
1819 | Production::PrecDynamic { content, .. }
1820 | Production::Reserved { content, .. } => walk(content, constraints),
1821 _ => false,
1822 }
1823 }
1824 walk(production, constraints)
1825}
1826
1827fn children_for<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Vec<&'a Edge> {
1828 // Walk `outgoing` (insertion-ordered by SchemaBuilder via SmallVec
1829 // append) rather than the unordered `edges` HashMap so abstract
1830 // schemas under REPEAT(CHOICE(...)) preserve the order their edges
1831 // were inserted in. The previous implementation walked the HashMap
1832 // and sorted lexicographically by (kind, target id), which fused
1833 // interleaved children of the same kind into runs (e.g. a sequence
1834 // [symbol, punct, int, symbol, punct, int] became [symbol, symbol,
1835 // punct, punct, int, int] after the lex sort).
1836 let Some(edges) = schema.outgoing.get(vertex_id) else {
1837 return Vec::new();
1838 };
1839
1840 // Look up the canonical Edge reference (the key in `schema.edges`)
1841 // for each entry in `outgoing`. Falls back to the SmallVec entry if
1842 // the canonical key is missing, which would indicate index drift.
1843 let mut indexed: Vec<(usize, u32, &Edge)> = edges
1844 .iter()
1845 .enumerate()
1846 .map(|(i, e)| {
1847 let canonical = schema.edges.get_key_value(e).map_or(e, |(k, _)| k);
1848 let pos = schema.orderings.get(canonical).copied().unwrap_or(u32::MAX);
1849 (i, pos, canonical)
1850 })
1851 .collect();
1852
1853 // Stable sort by (explicit-ordering, insertion-index). Edges with
1854 // an explicit `orderings` entry come first in their declared order;
1855 // the remainder fall through in insertion order.
1856 indexed.sort_by_key(|(i, pos, _)| (*pos, *i));
1857 indexed.into_iter().map(|(_, _, e)| e).collect()
1858}
1859
1860fn vertex_id_kind<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1861 schema.vertices.get(vertex_id).map(|v| v.kind.as_ref())
1862}
1863
1864fn literal_value<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
1865 schema
1866 .constraints
1867 .get(vertex_id)?
1868 .iter()
1869 .find(|c| c.sort.as_ref() == "literal-value")
1870 .map(|c| c.value.as_str())
1871}
1872
1873/// True iff `pattern` matches a (possibly optional / repeated) sequence
1874/// of carriage-return and newline characters only. Examples: `\r?\n`,
1875/// `\n`, `\r\n`, `\n+`, `\r?\n+`. Distinguishes structural newline
1876/// terminals from generic whitespace and from other patterns that
1877/// happen to contain a newline escape inside a larger class.
1878fn is_newline_like_pattern(pattern: &str) -> bool {
1879 if pattern.is_empty() {
1880 return false;
1881 }
1882 let mut chars = pattern.chars();
1883 let mut saw_newline_atom = false;
1884 while let Some(c) = chars.next() {
1885 match c {
1886 '\\' => match chars.next() {
1887 Some('n' | 'r') => saw_newline_atom = true,
1888 _ => return false,
1889 },
1890 '?' | '*' | '+' => {} // quantifiers on the previous atom
1891 _ => return false,
1892 }
1893 }
1894 saw_newline_atom
1895}
1896
1897/// True iff `pattern` matches a (possibly quantified) run of generic
1898/// whitespace characters: `\s+`, `[ \t]+`, ` +`, `\s*`. Such patterns
1899/// describe interstitial spacing rather than syntactic content, so the
1900/// pretty emitter can drop them and let the layout pass insert the
1901/// configured separator.
1902fn is_whitespace_only_pattern(pattern: &str) -> bool {
1903 if pattern.is_empty() {
1904 return false;
1905 }
1906 // Strip an outer quantifier suffix.
1907 let trimmed = pattern.trim_end_matches(['?', '*', '+']);
1908 if trimmed.is_empty() {
1909 return false;
1910 }
1911 // Bare `\s` / ` ` / `\t`.
1912 if matches!(trimmed, "\\s" | " " | "\\t") {
1913 return true;
1914 }
1915 // Character class containing only whitespace atoms.
1916 if let Some(inner) = trimmed.strip_prefix('[').and_then(|s| s.strip_suffix(']')) {
1917 let mut chars = inner.chars();
1918 let mut saw_atom = false;
1919 while let Some(c) = chars.next() {
1920 match c {
1921 '\\' => match chars.next() {
1922 Some('s' | 't' | 'r' | 'n') => saw_atom = true,
1923 _ => return false,
1924 },
1925 ' ' | '\t' => saw_atom = true,
1926 _ => return false,
1927 }
1928 }
1929 return saw_atom;
1930 }
1931 false
1932}
1933
1934fn placeholder_for_pattern(pattern: &str) -> String {
1935 // Heuristic placeholder for unconstrained PATTERN terminals.
1936 //
1937 // First handle the "the regex IS a literal escape" cases that
1938 // tree-sitter grammars use as separators (`\n`, `\r\n`, `;`,
1939 // etc.); emitting the matching character is always preferable
1940 // to a `_x` identifier-like placeholder when the surrounding
1941 // grammar expects a separator.
1942 let simple_lit = decode_simple_pattern_literal(pattern);
1943 if let Some(lit) = simple_lit {
1944 return lit;
1945 }
1946
1947 if pattern.contains("[0-9]") || pattern.contains("\\d") {
1948 "0".into()
1949 } else if pattern.contains("[a-zA-Z_]") || pattern.contains("\\w") {
1950 "_x".into()
1951 } else if pattern.contains('"') || pattern.contains('\'') {
1952 "\"\"".into()
1953 } else {
1954 "_".into()
1955 }
1956}
1957
1958/// Decode a tree-sitter PATTERN whose regex is a simple literal
1959/// (newline, semicolon, comma, etc.) to the byte sequence it matches.
1960/// Returns `None` for patterns with character classes, alternations,
1961/// or quantifiers; the caller falls back to the heuristic placeholder.
1962fn decode_simple_pattern_literal(pattern: &str) -> Option<String> {
1963 // Skip patterns containing regex metachars that would broaden the
1964 // match beyond a single literal byte sequence.
1965 if pattern
1966 .chars()
1967 .any(|c| matches!(c, '[' | ']' | '(' | ')' | '*' | '+' | '?' | '|' | '{' | '}'))
1968 {
1969 return None;
1970 }
1971 let mut out = String::new();
1972 let mut chars = pattern.chars();
1973 while let Some(c) = chars.next() {
1974 if c == '\\' {
1975 match chars.next() {
1976 Some('n') => out.push('\n'),
1977 Some('r') => out.push('\r'),
1978 Some('t') => out.push('\t'),
1979 Some('\\') => out.push('\\'),
1980 Some('/') => out.push('/'),
1981 Some(other) => out.push(other),
1982 None => return None,
1983 }
1984 } else {
1985 out.push(c);
1986 }
1987 }
1988 Some(out)
1989}
1990
1991// ═══════════════════════════════════════════════════════════════════
1992// Token list output with Spacing algebra
1993// ═══════════════════════════════════════════════════════════════════
1994//
1995// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
1996// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
1997// by `FormatPolicy`. Separating the structural output from the layout
1998// decision means each phase has one job: emit walks the grammar and
1999// pushes tokens; layout is a single fold, locally driven by adjacent
2000// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
2001
2002#[derive(Clone)]
2003enum Token {
2004 /// A user-visible terminal contributed by the grammar.
2005 Lit(String),
2006 /// `indent_open` marker emitted when a `Lit` matched the policy's
2007 /// open list. Carried as a separate token so layout can decide to
2008 /// break + indent without re-scanning.
2009 IndentOpen,
2010 /// `indent_close` marker emitted before a closer-`Lit`.
2011 IndentClose,
2012 /// "Break a line here if not already at line start" — used after
2013 /// statements/declarations and after open braces.
2014 LineBreak,
2015 /// Suppress the next inter-Lit separator. Pushed by the REPEAT
2016 /// walker when an iteration's "separator slot" (a CHOICE-with-BLANK
2017 /// or OPTIONAL at SEQ position 0) emitted zero content tokens, so
2018 /// the categorical reading is "no source-level separator existed
2019 /// between these two sibling iterations of the body".
2020 NoSpace,
2021}
2022
2023struct Output<'a> {
2024 tokens: Vec<Token>,
2025 policy: &'a FormatPolicy,
2026}
2027
2028#[derive(Clone)]
2029struct OutputSnapshot {
2030 tokens_len: usize,
2031}
2032
2033impl<'a> Output<'a> {
2034 fn new(policy: &'a FormatPolicy) -> Self {
2035 Self {
2036 tokens: Vec::new(),
2037 policy,
2038 }
2039 }
2040
2041 fn token(&mut self, value: &str) {
2042 if value.is_empty() {
2043 return;
2044 }
2045
2046 // A grammar STRING whose value is a newline (e.g. abc's `_NL = "\n"`
2047 // or any rule that uses `"\n"` as a structural line terminator)
2048 // must route through the layout's `LineBreak` channel. Emitting it
2049 // as a `Lit` leaves the newline character in the byte stream but
2050 // also makes `needs_space_between` insert the configured separator
2051 // between the newline and the following token, producing leading
2052 // spaces on every line after the first.
2053 if value == "\n" || value == "\r\n" || value == "\r" {
2054 self.tokens.push(Token::LineBreak);
2055 return;
2056 }
2057
2058 if self.policy.indent_close.iter().any(|t| t == value) {
2059 self.tokens.push(Token::IndentClose);
2060 }
2061
2062 self.tokens.push(Token::Lit(value.to_owned()));
2063
2064 if self.policy.indent_open.iter().any(|t| t == value) {
2065 self.tokens.push(Token::IndentOpen);
2066 self.tokens.push(Token::LineBreak);
2067 } else if self.policy.line_break_after.iter().any(|t| t == value) {
2068 self.tokens.push(Token::LineBreak);
2069 }
2070 }
2071
2072 fn newline(&mut self) {
2073 self.tokens.push(Token::LineBreak);
2074 }
2075
2076 /// Open an indent scope: subsequent `LineBreak`s render at the
2077 /// new depth until a matching `indent_close` pops it. Used by the
2078 /// external-token fallback to render indent-based grammars'
2079 /// `_indent` scanner outputs.
2080 fn indent_open(&mut self) {
2081 self.tokens.push(Token::IndentOpen);
2082 self.tokens.push(Token::LineBreak);
2083 }
2084
2085 /// Close one indent scope opened by `indent_open`.
2086 fn indent_close(&mut self) {
2087 self.tokens.push(Token::IndentClose);
2088 }
2089
2090 fn snapshot(&self) -> OutputSnapshot {
2091 OutputSnapshot {
2092 tokens_len: self.tokens.len(),
2093 }
2094 }
2095
2096 fn restore(&mut self, snap: OutputSnapshot) {
2097 self.tokens.truncate(snap.tokens_len);
2098 }
2099
2100 /// True iff at least one `Token::Lit` was pushed since `snap`.
2101 /// Control-only emissions (`LineBreak`, `IndentOpen` / `IndentClose`,
2102 /// `NoSpace`) do not count as content. Used by the REPEAT walker
2103 /// to detect that a "separator slot" CHOICE picked its BLANK
2104 /// alternative, so the next iteration's content can be marked
2105 /// tight against the previous iteration's content.
2106 fn lit_emitted_since(&self, snap: OutputSnapshot) -> bool {
2107 self.tokens[snap.tokens_len..]
2108 .iter()
2109 .any(|t| matches!(t, Token::Lit(_)))
2110 }
2111
2112 /// Push a marker that suppresses the next inter-Lit separator the
2113 /// layout pass would otherwise insert. Used to encode "no source-
2114 /// level separator was emitted between these two Lits" without
2115 /// having to make per-grammar adjacency decisions in the layout.
2116 fn no_space(&mut self) {
2117 self.tokens.push(Token::NoSpace);
2118 }
2119
2120 fn finish(self) -> Vec<u8> {
2121 layout(&self.tokens, self.policy)
2122 }
2123}
2124
2125/// Fold a token list into bytes. The algebra:
2126/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
2127/// * `IndentOpen` / `IndentClose` adjust a depth counter,
2128/// * `LineBreak` writes `\n` if not already at line start, then the
2129/// next `Lit` writes `indent * indent_width` spaces of indent.
2130fn layout(tokens: &[Token], policy: &FormatPolicy) -> Vec<u8> {
2131 let mut bytes = Vec::new();
2132 let mut indent: usize = 0;
2133 let mut at_line_start = true;
2134 let mut last_lit: Option<&str> = None;
2135 // True iff, at the moment `last_lit` was emitted, the cursor was at a
2136 // position where the grammar expects an operand: start of stream / line,
2137 // just after an open paren / bracket / brace, just after a separator like
2138 // `,` or `;`, or just after a binary / assignment operator. Used by
2139 // `needs_space_between` to recognise `last_lit` as a tight unary prefix
2140 // (`f(-1.0)`) rather than a spaced binary operator (`a - b`).
2141 let mut last_was_in_operand_position = true;
2142 let mut expecting_operand = true;
2143 // Set when a `Token::NoSpace` marker is seen; cleared when the next
2144 // Lit consumes it. While set, suppress the policy separator that
2145 // would otherwise be inserted before the next Lit.
2146 let mut suppress_next_separator = false;
2147 let newline = policy.newline.as_bytes();
2148 let separator = policy.separator.as_bytes();
2149
2150 for tok in tokens {
2151 match tok {
2152 Token::IndentOpen => indent += 1,
2153 Token::IndentClose => {
2154 indent = indent.saturating_sub(1);
2155 if !at_line_start {
2156 bytes.extend_from_slice(newline);
2157 at_line_start = true;
2158 expecting_operand = true;
2159 }
2160 }
2161 Token::LineBreak => {
2162 if !at_line_start {
2163 bytes.extend_from_slice(newline);
2164 at_line_start = true;
2165 expecting_operand = true;
2166 }
2167 }
2168 Token::NoSpace => {
2169 suppress_next_separator = true;
2170 }
2171 Token::Lit(value) => {
2172 if at_line_start {
2173 bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
2174 } else if let Some(prev) = last_lit {
2175 if !suppress_next_separator
2176 && needs_space_between(prev, value, last_was_in_operand_position)
2177 {
2178 bytes.extend_from_slice(separator);
2179 }
2180 }
2181 suppress_next_separator = false;
2182 bytes.extend_from_slice(value.as_bytes());
2183 at_line_start = false;
2184 last_was_in_operand_position = expecting_operand;
2185 expecting_operand = leaves_operand_position(value);
2186 last_lit = Some(value.as_str());
2187 }
2188 }
2189 }
2190
2191 if !at_line_start {
2192 bytes.extend_from_slice(newline);
2193 }
2194 bytes
2195}
2196
2197/// True iff emitting `tok` leaves the cursor in a position where the
2198/// grammar expects an operand next. Operand-introducing tokens are open
2199/// punctuation, separators, and operator-like strings; operand-terminating
2200/// tokens are identifiers, literals, and closing punctuation.
2201fn leaves_operand_position(tok: &str) -> bool {
2202 if tok.is_empty() {
2203 return true;
2204 }
2205 if is_punct_open(tok) {
2206 return true;
2207 }
2208 if matches!(tok, "," | ";") {
2209 return true;
2210 }
2211 if is_punct_close(tok) {
2212 return false;
2213 }
2214 if first_is_alnum_or_underscore(tok) || last_ends_with_alnum(tok) {
2215 return false;
2216 }
2217 // Pure punctuation/operator runs (`=`, `+`, `-`, `<=`, `>>`, …) leave
2218 // the cursor expecting another operand.
2219 true
2220}
2221
2222fn needs_space_between(last: &str, next: &str, expecting_operand: bool) -> bool {
2223 if last.is_empty() || next.is_empty() {
2224 return false;
2225 }
2226 if is_punct_open(last) || is_punct_open(next) {
2227 return false;
2228 }
2229 if is_punct_close(next) {
2230 return false;
2231 }
2232 if is_punct_close(last) && is_punct_punctuation(next) {
2233 return false;
2234 }
2235 if last == "." || next == "." {
2236 return false;
2237 }
2238 // Tight unary prefix: `last` is a sign/logical-not operator emitted
2239 // where the grammar expected an operand, so it glues to `next`.
2240 // `expecting_operand` here means: just before `last` was emitted,
2241 // the cursor expected an operand, which makes `last` a unary prefix.
2242 // Examples: `f(-1.0)`, `[ -2, 3 ]`, `return -x`, `a = !flag`.
2243 if expecting_operand && is_unary_prefix_operator(last) && first_is_operand_start(next) {
2244 return false;
2245 }
2246 if last_is_word_like(last) && first_is_word_like(next) {
2247 return true;
2248 }
2249 if last_ends_with_alnum(last) && first_is_alnum_or_underscore(next) {
2250 return true;
2251 }
2252 // Adjacent operator runs: keep them apart so the lexer doesn't glue
2253 // `>` and `=` into `>=` unintentionally.
2254 true
2255}
2256
2257fn is_unary_prefix_operator(s: &str) -> bool {
2258 matches!(s, "-" | "+" | "!" | "~")
2259}
2260
2261fn first_is_operand_start(s: &str) -> bool {
2262 s.chars()
2263 .next()
2264 .map(|c| c.is_alphanumeric() || c == '_' || c == '.' || c == '(')
2265 .unwrap_or(false)
2266}
2267
2268fn is_punct_open(s: &str) -> bool {
2269 matches!(s, "(" | "[" | "{" | "\"" | "'" | "`")
2270}
2271
2272fn is_punct_close(s: &str) -> bool {
2273 matches!(s, ")" | "]" | "}" | "," | ";" | ":" | "\"" | "'" | "`")
2274}
2275
2276fn is_punct_punctuation(s: &str) -> bool {
2277 matches!(s, "," | ";" | ":" | "." | ")" | "]" | "}")
2278}
2279
2280fn last_is_word_like(s: &str) -> bool {
2281 s.chars()
2282 .next_back()
2283 .map(|c| c.is_alphanumeric() || c == '_')
2284 .unwrap_or(false)
2285}
2286
2287fn first_is_word_like(s: &str) -> bool {
2288 s.chars()
2289 .next()
2290 .map(|c| c.is_alphanumeric() || c == '_')
2291 .unwrap_or(false)
2292}
2293
2294fn last_ends_with_alnum(s: &str) -> bool {
2295 s.chars()
2296 .next_back()
2297 .map(char::is_alphanumeric)
2298 .unwrap_or(false)
2299}
2300
2301fn first_is_alnum_or_underscore(s: &str) -> bool {
2302 s.chars()
2303 .next()
2304 .map(|c| c.is_alphanumeric() || c == '_')
2305 .unwrap_or(false)
2306}
2307
2308#[cfg(test)]
2309mod tests {
2310 use super::*;
2311
2312 #[test]
2313 fn parses_simple_grammar_json() {
2314 let bytes = br#"{
2315 "name": "tiny",
2316 "rules": {
2317 "program": {
2318 "type": "SEQ",
2319 "members": [
2320 {"type": "STRING", "value": "hello"},
2321 {"type": "STRING", "value": ";"}
2322 ]
2323 }
2324 }
2325 }"#;
2326 let g = Grammar::from_bytes("tiny", bytes).expect("valid tiny grammar");
2327 assert!(g.rules.contains_key("program"));
2328 }
2329
2330 #[test]
2331 fn output_emits_punctuation_without_leading_space() {
2332 let policy = FormatPolicy::default();
2333 let mut out = Output::new(&policy);
2334 out.token("foo");
2335 out.token("(");
2336 out.token(")");
2337 out.token(";");
2338 let bytes = out.finish();
2339 let s = std::str::from_utf8(&bytes).expect("ascii output");
2340 assert!(s.starts_with("foo();"), "got {s:?}");
2341 }
2342
2343 #[test]
2344 fn grammar_from_bytes_rejects_malformed_input() {
2345 let result = Grammar::from_bytes("malformed", b"not json");
2346 let err = result.expect_err("malformed bytes must yield Err");
2347 let msg = err.to_string();
2348 assert!(
2349 msg.contains("malformed"),
2350 "error message should name the protocol: {msg:?}"
2351 );
2352 }
2353
2354 #[test]
2355 fn output_indents_after_open_brace() {
2356 let policy = FormatPolicy::default();
2357 let mut out = Output::new(&policy);
2358 out.token("fn");
2359 out.token("foo");
2360 out.token("(");
2361 out.token(")");
2362 out.token("{");
2363 out.token("body");
2364 out.token("}");
2365 let bytes = out.finish();
2366 let s = std::str::from_utf8(&bytes).expect("ascii output");
2367 assert!(s.contains("{\n"), "newline after opening brace: {s:?}");
2368 assert!(s.contains("body"), "body inside block: {s:?}");
2369 assert!(s.ends_with("}\n"), "newline after closing brace: {s:?}");
2370 }
2371
2372 #[test]
2373 fn output_no_space_between_word_and_dot() {
2374 let policy = FormatPolicy::default();
2375 let mut out = Output::new(&policy);
2376 out.token("foo");
2377 out.token(".");
2378 out.token("bar");
2379 let bytes = out.finish();
2380 let s = std::str::from_utf8(&bytes).expect("ascii output");
2381 assert!(s.starts_with("foo.bar"), "no space around dot: {s:?}");
2382 }
2383
2384 #[test]
2385 fn output_snapshot_restore_truncates_bytes() {
2386 let policy = FormatPolicy::default();
2387 let mut out = Output::new(&policy);
2388 out.token("keep");
2389 let snap = out.snapshot();
2390 out.token("drop");
2391 out.token("more");
2392 out.restore(snap);
2393 out.token("after");
2394 let bytes = out.finish();
2395 let s = std::str::from_utf8(&bytes).expect("ascii output");
2396 assert!(s.contains("keep"), "kept token survives: {s:?}");
2397 assert!(s.contains("after"), "post-restore token visible: {s:?}");
2398 assert!(!s.contains("drop"), "rolled-back token removed: {s:?}");
2399 assert!(!s.contains("more"), "rolled-back token removed: {s:?}");
2400 }
2401
2402 #[test]
2403 fn child_cursor_take_field_consumes_once() {
2404 let edges_owned: Vec<Edge> = vec![Edge {
2405 src: panproto_gat::Name::from("p"),
2406 tgt: panproto_gat::Name::from("c"),
2407 kind: panproto_gat::Name::from("name"),
2408 name: None,
2409 }];
2410 let edges: Vec<&Edge> = edges_owned.iter().collect();
2411 let mut cursor = ChildCursor::new(&edges);
2412 let first = cursor.take_field("name");
2413 let second = cursor.take_field("name");
2414 assert!(first.is_some(), "first take returns the edge");
2415 assert!(
2416 second.is_none(),
2417 "second take returns None (already consumed)"
2418 );
2419 }
2420
2421 #[test]
2422 fn child_cursor_take_matching_predicate() {
2423 let edges_owned: Vec<Edge> = vec![
2424 Edge {
2425 src: "p".into(),
2426 tgt: "c1".into(),
2427 kind: "child_of".into(),
2428 name: None,
2429 },
2430 Edge {
2431 src: "p".into(),
2432 tgt: "c2".into(),
2433 kind: "key".into(),
2434 name: None,
2435 },
2436 ];
2437 let edges: Vec<&Edge> = edges_owned.iter().collect();
2438 let mut cursor = ChildCursor::new(&edges);
2439 assert!(cursor.has_matching(|e| e.kind.as_ref() == "key"));
2440 let taken = cursor.take_matching(|e| e.kind.as_ref() == "key");
2441 assert!(taken.is_some());
2442 assert!(
2443 !cursor.has_matching(|e| e.kind.as_ref() == "key"),
2444 "consumed edge no longer matches"
2445 );
2446 assert!(
2447 cursor.has_matching(|e| e.kind.as_ref() == "child_of"),
2448 "the other edge is still available"
2449 );
2450 }
2451
2452 #[test]
2453 fn kind_satisfies_symbol_direct_match() {
2454 let bytes = br#"{
2455 "name": "tiny",
2456 "rules": {
2457 "x": {"type": "STRING", "value": "x"}
2458 }
2459 }"#;
2460 let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
2461 assert!(kind_satisfies_symbol(&g, Some("x"), "x"));
2462 assert!(!kind_satisfies_symbol(&g, Some("y"), "x"));
2463 assert!(!kind_satisfies_symbol(&g, None, "x"));
2464 }
2465
2466 #[test]
2467 fn kind_satisfies_symbol_through_hidden_rule() {
2468 let bytes = br#"{
2469 "name": "tiny",
2470 "rules": {
2471 "_value": {
2472 "type": "CHOICE",
2473 "members": [
2474 {"type": "SYMBOL", "name": "object"},
2475 {"type": "SYMBOL", "name": "number"}
2476 ]
2477 },
2478 "object": {"type": "STRING", "value": "{}"},
2479 "number": {"type": "PATTERN", "value": "[0-9]+"}
2480 }
2481 }"#;
2482 let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
2483 assert!(
2484 kind_satisfies_symbol(&g, Some("number"), "_value"),
2485 "number is reachable from _value via CHOICE"
2486 );
2487 assert!(
2488 kind_satisfies_symbol(&g, Some("object"), "_value"),
2489 "object is reachable from _value via CHOICE"
2490 );
2491 assert!(
2492 !kind_satisfies_symbol(&g, Some("string"), "_value"),
2493 "string is NOT among the alternatives"
2494 );
2495 }
2496
2497 #[test]
2498 fn first_symbol_skips_string_terminals() {
2499 let prod: Production = serde_json::from_str(
2500 r#"{
2501 "type": "SEQ",
2502 "members": [
2503 {"type": "STRING", "value": "{"},
2504 {"type": "SYMBOL", "name": "body"},
2505 {"type": "STRING", "value": "}"}
2506 ]
2507 }"#,
2508 )
2509 .expect("valid SEQ");
2510 assert_eq!(first_symbol(&prod), Some("body"));
2511 }
2512
2513 #[test]
2514 fn placeholder_for_pattern_routes_by_regex_class() {
2515 assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
2516 assert_eq!(placeholder_for_pattern("[a-zA-Z_]\\w*"), "_x");
2517 assert_eq!(placeholder_for_pattern("\"[^\"]*\""), "\"\"");
2518 assert_eq!(placeholder_for_pattern("\\d+\\.\\d+"), "0");
2519 }
2520
2521 #[test]
2522 fn format_policy_default_breaks_after_semicolon() {
2523 let policy = FormatPolicy::default();
2524 assert!(policy.line_break_after.iter().any(|t| t == ";"));
2525 assert!(policy.indent_open.iter().any(|t| t == "{"));
2526 assert!(policy.indent_close.iter().any(|t| t == "}"));
2527 assert_eq!(policy.indent_width, 2);
2528 }
2529
2530 #[test]
2531 fn placeholder_decodes_literal_pattern_separators() {
2532 // PATTERN regexes that match a single literal byte sequence
2533 // (newline, semicolon, comma) emit the bytes verbatim instead
2534 // of falling through to the `_` catch-all.
2535 assert_eq!(placeholder_for_pattern("\\n"), "\n");
2536 assert_eq!(placeholder_for_pattern("\\r\\n"), "\r\n");
2537 assert_eq!(placeholder_for_pattern(";"), ";");
2538 // Patterns with character classes / alternation still route
2539 // through the heuristic.
2540 assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
2541 assert_eq!(placeholder_for_pattern("a|b"), "_");
2542 }
2543
2544 #[test]
2545 fn supertypes_decode_from_grammar_json_strings() {
2546 // Tree-sitter older grammars list supertypes as bare strings.
2547 let bytes = br#"{
2548 "name": "tiny",
2549 "supertypes": ["expression"],
2550 "rules": {
2551 "expression": {
2552 "type": "CHOICE",
2553 "members": [
2554 {"type": "SYMBOL", "name": "binary_expression"},
2555 {"type": "SYMBOL", "name": "identifier"}
2556 ]
2557 },
2558 "binary_expression": {"type": "STRING", "value": "x"},
2559 "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2560 }
2561 }"#;
2562 let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2563 assert!(g.supertypes.contains("expression"));
2564 // identifier matches the supertype `expression`.
2565 assert!(kind_satisfies_symbol(&g, Some("identifier"), "expression"));
2566 // unrelated kinds do not.
2567 assert!(!kind_satisfies_symbol(&g, Some("string"), "expression"));
2568 }
2569
2570 #[test]
2571 fn supertypes_decode_from_grammar_json_objects() {
2572 // Recent grammars list supertypes as `{type: SYMBOL, name: ...}`
2573 // entries instead of bare strings.
2574 let bytes = br#"{
2575 "name": "tiny",
2576 "supertypes": [{"type": "SYMBOL", "name": "stmt"}],
2577 "rules": {
2578 "stmt": {
2579 "type": "CHOICE",
2580 "members": [
2581 {"type": "SYMBOL", "name": "while_stmt"},
2582 {"type": "SYMBOL", "name": "if_stmt"}
2583 ]
2584 },
2585 "while_stmt": {"type": "STRING", "value": "while"},
2586 "if_stmt": {"type": "STRING", "value": "if"}
2587 }
2588 }"#;
2589 let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2590 assert!(g.supertypes.contains("stmt"));
2591 assert!(kind_satisfies_symbol(&g, Some("while_stmt"), "stmt"));
2592 }
2593
2594 #[test]
2595 fn alias_value_matches_kind() {
2596 // A named ALIAS rewrites the parser-visible kind to `value`;
2597 // `kind_satisfies_symbol` should accept that rewritten kind
2598 // when looking up the original SYMBOL.
2599 let bytes = br#"{
2600 "name": "tiny",
2601 "rules": {
2602 "_package_identifier": {
2603 "type": "ALIAS",
2604 "named": true,
2605 "value": "package_identifier",
2606 "content": {"type": "SYMBOL", "name": "identifier"}
2607 },
2608 "identifier": {"type": "PATTERN", "value": "[a-z]+"}
2609 }
2610 }"#;
2611 let g = Grammar::from_bytes("tiny", bytes).expect("parse");
2612 assert!(kind_satisfies_symbol(
2613 &g,
2614 Some("package_identifier"),
2615 "_package_identifier"
2616 ));
2617 }
2618
2619 #[test]
2620 fn referenced_symbols_walks_nested_seq() {
2621 let prod: Production = serde_json::from_str(
2622 r#"{
2623 "type": "SEQ",
2624 "members": [
2625 {"type": "CHOICE", "members": [
2626 {"type": "SYMBOL", "name": "attribute_item"},
2627 {"type": "BLANK"}
2628 ]},
2629 {"type": "SYMBOL", "name": "parameter"},
2630 {"type": "REPEAT", "content": {
2631 "type": "SEQ",
2632 "members": [
2633 {"type": "STRING", "value": ","},
2634 {"type": "SYMBOL", "name": "parameter"}
2635 ]
2636 }}
2637 ]
2638 }"#,
2639 )
2640 .expect("seq");
2641 let symbols = referenced_symbols(&prod);
2642 assert!(symbols.contains(&"attribute_item"));
2643 assert!(symbols.contains(&"parameter"));
2644 }
2645
2646 #[test]
2647 fn literal_strings_collects_choice_members() {
2648 let prod: Production = serde_json::from_str(
2649 r#"{
2650 "type": "CHOICE",
2651 "members": [
2652 {"type": "STRING", "value": "+"},
2653 {"type": "STRING", "value": "-"},
2654 {"type": "STRING", "value": "*"}
2655 ]
2656 }"#,
2657 )
2658 .expect("choice");
2659 let strings = literal_strings(&prod);
2660 assert_eq!(strings, vec!["+", "-", "*"]);
2661 }
2662
2663 /// The ocaml and javascript grammars (tree-sitter ≥ 0.25) emit a
2664 /// `RESERVED` rule kind that an earlier deserialiser rejected
2665 /// with `unknown variant "RESERVED"`. Verify both that the bare
2666 /// variant deserialises and that a `RESERVED`-wrapped grammar is
2667 /// loadable end-to-end via [`Grammar::from_bytes`].
2668 #[test]
2669 fn reserved_variant_deserialises() {
2670 let prod: Production = serde_json::from_str(
2671 r#"{
2672 "type": "RESERVED",
2673 "content": {"type": "SYMBOL", "name": "_lowercase_identifier"},
2674 "context_name": "attribute_id"
2675 }"#,
2676 )
2677 .expect("RESERVED parses");
2678 match prod {
2679 Production::Reserved { content, .. } => match *content {
2680 Production::Symbol { name } => assert_eq!(name, "_lowercase_identifier"),
2681 other => panic!("expected inner SYMBOL, got {other:?}"),
2682 },
2683 other => panic!("expected RESERVED, got {other:?}"),
2684 }
2685 }
2686
2687 #[test]
2688 fn reserved_grammar_loads_end_to_end() {
2689 let bytes = br#"{
2690 "name": "tiny_reserved",
2691 "rules": {
2692 "program": {
2693 "type": "RESERVED",
2694 "content": {"type": "SYMBOL", "name": "ident"},
2695 "context_name": "keywords"
2696 },
2697 "ident": {"type": "PATTERN", "value": "[a-z]+"}
2698 }
2699 }"#;
2700 let g = Grammar::from_bytes("tiny_reserved", bytes).expect("RESERVED-using grammar loads");
2701 assert!(g.rules.contains_key("program"));
2702 }
2703
2704 #[test]
2705 fn reserved_walker_helpers_recurse_into_content() {
2706 // The walker's helpers (first_symbol, has_field_in,
2707 // referenced_symbols, ...) all need to descend through
2708 // RESERVED into its content. If they bail at RESERVED, the
2709 // `pick_choice_with_cursor` heuristic ranks the alt below
2710 // alts that DO recurse, which produces wrong emit output
2711 // even when the deserialiser doesn't crash.
2712 let prod: Production = serde_json::from_str(
2713 r#"{
2714 "type": "RESERVED",
2715 "content": {
2716 "type": "FIELD",
2717 "name": "lhs",
2718 "content": {"type": "SYMBOL", "name": "expr"}
2719 },
2720 "context_name": "ctx"
2721 }"#,
2722 )
2723 .expect("nested RESERVED parses");
2724 assert_eq!(first_symbol(&prod), Some("expr"));
2725 assert!(has_field_in(&prod, &["lhs"]));
2726 let symbols = referenced_symbols(&prod);
2727 assert!(symbols.contains(&"expr"));
2728 }
2729}