Skip to main content

panproto_parse/
walker.rs

1//! Generic tree-sitter AST walker that converts parse trees to panproto schemas.
2//!
3//! Because theories are auto-derived from the grammar, the walker is fully generic:
4//! one implementation works for all languages. The node's `kind()` IS the panproto
5//! vertex kind; the field name IS the edge kind.
6//!
7//! Named-scope detection (functions, classes, methods, modules, types) is driven
8//! by the grammar's `queries/tags.scm` file via [`ScopeDetector`], not by a
9//! hardcoded node-kind list. This makes scope detection uniformly correct across
10//! every tree-sitter grammar that ships a tags query. See the [`scope_detector`]
11//! module for the full rationale.
12//!
13//! [`scope_detector`]: crate::scope_detector
14//! [`ScopeDetector`]: crate::scope_detector::ScopeDetector
15
16use std::collections::BTreeMap;
17
18use panproto_schema::{Protocol, Schema, SchemaBuilder};
19use rustc_hash::FxHashSet;
20
21use crate::error::ParseError;
22use crate::id_scheme::IdGenerator;
23use crate::scope_detector::{NamedScope, ScopeDetector};
24use crate::theory_extract::ExtractedTheoryMeta;
25
26/// Nodes whose kind names suggest they contain ordered statement sequences.
27///
28/// Unlike scope detection (which is grammar-driven via `tags.scm`), block
29/// grouping is a structural concern: we want sibling statements inside a
30/// block to get positional IDs (`$0`, `$1`, ...) so insertions don't
31/// cascade. Per-language [`WalkerConfig`] overrides extend this list.
32const BLOCK_KINDS: &[&str] = &[
33    "block",
34    "statement_block",
35    "compound_statement",
36    "declaration_list",
37    "field_declaration_list",
38    "enum_body",
39    "class_body",
40    "interface_body",
41    "module_body",
42];
43
44/// Configuration for the walker, allowing per-language customization.
45#[derive(Debug, Clone, Default)]
46pub struct WalkerConfig {
47    /// Additional node kinds that contain ordered statement sequences.
48    ///
49    /// Named-scope detection is handled by [`ScopeDetector`] from the
50    /// grammar's `tags.scm`; no per-language scope configuration is
51    /// required here.
52    ///
53    /// [`ScopeDetector`]: crate::scope_detector::ScopeDetector
54    pub extra_block_kinds: Vec<String>,
55    /// Whether to capture comment nodes as constraints on the following sibling.
56    pub capture_comments: bool,
57    /// Whether to capture whitespace/formatting as constraints.
58    pub capture_formatting: bool,
59}
60
61impl WalkerConfig {
62    /// Construct a config with formatting and comment capture enabled (the
63    /// common default; [`WalkerConfig::default`] returns all-false).
64    #[must_use]
65    pub const fn standard() -> Self {
66        Self {
67            extra_block_kinds: Vec::new(),
68            capture_comments: true,
69            capture_formatting: true,
70        }
71    }
72}
73
74/// Generic AST walker that converts a tree-sitter parse tree to a panproto [`Schema`].
75///
76/// The walker uses the auto-derived theory to determine vertex and edge kinds directly
77/// from the tree-sitter AST, requiring no manual mapping table. Named-scope identity
78/// (the part of the vertex ID that survives insertions) is driven by [`ScopeDetector`]
79/// from the grammar's `tags.scm` query.
80///
81/// [`ScopeDetector`]: crate::scope_detector::ScopeDetector
82pub struct AstWalker<'a> {
83    /// The source code bytes (needed for extracting text of leaf nodes).
84    source: &'a [u8],
85    /// The auto-derived theory metadata. The `vertex_kinds` set is used to
86    /// filter anonymous/internal tree-sitter nodes that are not part of the
87    /// language's public grammar.
88    theory_meta: &'a ExtractedTheoryMeta,
89    /// The protocol definition (for `SchemaBuilder` validation).
90    protocol: &'a Protocol,
91    /// Per-language configuration.
92    config: WalkerConfig,
93    /// Known block kinds (merged from defaults + config).
94    block_kinds: FxHashSet<String>,
95    /// Named scopes indexed by `(start_byte, end_byte)` for O(log n) lookup
96    /// during the tree walk. Derived from a [`ScopeDetector`] run over the
97    /// full source before the walk begins.
98    scope_map: BTreeMap<(usize, usize), NamedScope>,
99}
100
101impl<'a> AstWalker<'a> {
102    /// Create a new walker for the given source, theory, and protocol.
103    ///
104    /// Runs an optional [`ScopeDetector`] over the source to build a
105    /// per-file scope map. Pass `None` to disable named-scope detection
106    /// (every non-root vertex gets a positional ID). Pass `Some(detector)`
107    /// whose [`has_query`] is `false` for the same effect; the detector
108    /// short-circuits to an empty scope list.
109    ///
110    /// [`ScopeDetector`]: crate::scope_detector::ScopeDetector
111    /// [`has_query`]: crate::scope_detector::ScopeDetector::has_query
112    #[must_use]
113    pub fn new(
114        source: &'a [u8],
115        theory_meta: &'a ExtractedTheoryMeta,
116        protocol: &'a Protocol,
117        config: WalkerConfig,
118        scope_detector: Option<&mut ScopeDetector>,
119    ) -> Self {
120        let mut block_kinds: FxHashSet<String> =
121            BLOCK_KINDS.iter().map(|s| (*s).to_owned()).collect();
122        for kind in &config.extra_block_kinds {
123            block_kinds.insert(kind.clone());
124        }
125
126        let mut scope_map: BTreeMap<(usize, usize), NamedScope> = BTreeMap::new();
127        if let Some(det) = scope_detector {
128            for scope in det.scopes(source) {
129                scope_map.insert((scope.node_range.start, scope.node_range.end), scope);
130            }
131        }
132
133        Self {
134            source,
135            theory_meta,
136            protocol,
137            config,
138            block_kinds,
139            scope_map,
140        }
141    }
142
143    /// Walk the entire parse tree and produce a [`Schema`].
144    ///
145    /// # Errors
146    ///
147    /// Returns [`ParseError::SchemaConstruction`] if schema building fails.
148    pub fn walk(&self, tree: &tree_sitter::Tree, file_path: &str) -> Result<Schema, ParseError> {
149        let mut id_gen = IdGenerator::new(file_path);
150        let builder = SchemaBuilder::new(self.protocol);
151        let root = tree.root_node();
152
153        let builder = self.walk_node(root, builder, &mut id_gen, None)?;
154
155        builder.build().map_err(|e| ParseError::SchemaConstruction {
156            reason: e.to_string(),
157        })
158    }
159
160    /// Look up a node's named-scope entry, if any.
161    fn scope_for(&self, node: tree_sitter::Node<'_>) -> Option<&NamedScope> {
162        self.scope_map.get(&(node.start_byte(), node.end_byte()))
163    }
164
165    /// Recursively walk a single node, emitting vertices and edges.
166    fn walk_node(
167        &self,
168        node: tree_sitter::Node<'_>,
169        mut builder: SchemaBuilder,
170        id_gen: &mut IdGenerator,
171        parent_vertex_id: Option<&str>,
172    ) -> Result<SchemaBuilder, ParseError> {
173        // Skip anonymous tokens (punctuation, keywords like `{`, `}`, `,`, etc.).
174        if !node.is_named() {
175            return Ok(builder);
176        }
177
178        let kind = node.kind();
179
180        // Skip the root "program"/"source_file"/"module" wrapper if it just wraps children.
181        // We still process it to emit its children, but do so by iterating directly.
182        let is_root_wrapper = parent_vertex_id.is_none()
183            && (kind == "program"
184                || kind == "source_file"
185                || kind == "module"
186                || kind == "translation_unit");
187
188        let named_scope = if is_root_wrapper {
189            None
190        } else {
191            self.scope_for(node)
192        };
193
194        // Determine vertex ID.
195        //
196        // For a node that is *both* named-scope and scope-introducing
197        // (the common case: a `function_definition`, `class_definition`,
198        // `module`, etc.) we must disambiguate the name exactly once
199        // and reuse the same disambiguated leaf for both the vertex ID
200        // here and the scope-stack frame pushed below. The
201        // `record_name` / `push_recorded_scope` split on `IdGenerator`
202        // makes that explicit: `record_name` is the side-effecting
203        // step that bumps the parent frame's `seen` counter, the leaf
204        // it returns becomes the trailing component of the vertex ID,
205        // and `push_recorded_scope` later takes the same leaf without
206        // re-recording.
207        let (vertex_id, recorded_named_leaf) = if is_root_wrapper {
208            // Root wrappers get the file path as their ID.
209            (id_gen.current_prefix(), None)
210        } else if let Some(scope) = named_scope {
211            let leaf = id_gen.record_name(&scope.name);
212            let prefix = id_gen.current_prefix();
213            (format!("{prefix}::{leaf}"), Some(leaf))
214        } else {
215            // All other nodes get positional IDs.
216            (id_gen.anonymous_id(), None)
217        };
218
219        // Determine the effective vertex kind. If the theory has extracted vertex kinds,
220        // use those for validation. If the kind is unknown to the theory AND the protocol
221        // has a closed obj_kinds list, fall back to "node".
222        let effective_kind = if self.protocol.obj_kinds.is_empty() {
223            // Open protocol: accept all kinds.
224            kind
225        } else if self.protocol.obj_kinds.iter().any(|k| k == kind) {
226            kind
227        } else if !self.theory_meta.vertex_kinds.is_empty()
228            && self.theory_meta.vertex_kinds.iter().any(|k| k == kind)
229        {
230            // Known in the auto-derived theory even if not in the protocol's obj_kinds.
231            kind
232        } else {
233            "node"
234        };
235
236        builder = builder
237            .vertex(&vertex_id, effective_kind, None)
238            .map_err(|e| ParseError::SchemaConstruction {
239                reason: format!("vertex '{vertex_id}' ({kind}): {e}"),
240            })?;
241
242        // Emit edge from parent to this node.
243        if let Some(parent_id) = parent_vertex_id {
244            // Determine edge kind: use the tree-sitter field name if this node
245            // was accessed via a field, otherwise use "child_of".
246            let edge_kind = node
247                .parent()
248                .and_then(|p| {
249                    // Find which field of the parent this node corresponds to.
250                    for i in 0..p.child_count() {
251                        if let Some(child) = p.child(u32::try_from(i).unwrap_or(0)) {
252                            if child.id() == node.id() {
253                                return u32::try_from(i)
254                                    .ok()
255                                    .and_then(|idx| p.field_name_for_child(idx));
256                            }
257                        }
258                    }
259                    None
260                })
261                .unwrap_or("child_of");
262
263            builder = builder
264                .edge(parent_id, &vertex_id, edge_kind, None)
265                .map_err(|e| ParseError::SchemaConstruction {
266                    reason: format!("edge {parent_id} -> {vertex_id} ({edge_kind}): {e}"),
267                })?;
268        }
269
270        // Store byte range for position-aware emission.
271        builder = builder.constraint(&vertex_id, "start-byte", &node.start_byte().to_string());
272        builder = builder.constraint(&vertex_id, "end-byte", &node.end_byte().to_string());
273
274        // Capture any leading bytes that precede the document root. Tree-sitter
275        // excludes leading `extra` tokens (a UTF-8 BOM, an awk `\<newline>`
276        // line-continuation, a leading comment) from the root node's span, so
277        // those bytes belong to no vertex's interstitial run and are dropped on
278        // replay. Record them as a `doc-prefix` constraint on the root vertex so
279        // the byte-faithful emit path can reproduce them verbatim. Only the
280        // document root (no parent) with a non-zero start byte carries this;
281        // every other vertex's leading gap is already a sibling interstitial.
282        if parent_vertex_id.is_none() && node.start_byte() > 0 {
283            if let Ok(prefix) = std::str::from_utf8(&self.source[..node.start_byte()]) {
284                if !prefix.is_empty() {
285                    builder = builder.constraint(&vertex_id, "doc-prefix", prefix);
286                }
287            }
288        }
289
290        // Record the pre-alias grammar symbol name when it differs from
291        // the post-alias kind. Tree-sitter 0.25 exposes `grammar_name`
292        // (the SYMBOL name as it appears in the rule body, before
293        // `ALIAS { value: V }` rewriting). This is the signal that
294        // disambiguates which production reached this child: when
295        // emit's CHOICE dispatcher sees two alternatives both yielding
296        // a child of kind `K`, one through `SYMBOL K` and one through
297        // `ALIAS { ..., value: K }`, the recorded `pre-alias-symbol`
298        // identifies which.
299        let grammar_name = node.grammar_name();
300        if grammar_name != kind {
301            builder = builder.constraint(&vertex_id, "pre-alias-symbol", grammar_name);
302        }
303
304        // Emit constraints for leaf nodes (literals, identifiers, operators).
305        if node.named_child_count() == 0 {
306            if let Ok(text) = node.utf8_text(self.source) {
307                builder = builder.constraint(&vertex_id, "literal-value", text);
308            }
309        }
310
311        // Capture field-keyed anonymous-token children as `field:<name>`
312        // constraints on this vertex. Tree-sitter rules of the form
313        // `field('op', choice('+', '-', '*', '/'))` produce children
314        // that are field-named but not themselves named nodes, so they
315        // are skipped by the named-child walk above and would otherwise
316        // be invisible to downstream consumers. Emitting the value here
317        // lets consumers read `schema.field_text(vid, name)` directly
318        // rather than reconstructing the text via start-byte / end-byte
319        // arithmetic against the source buffer.
320        builder = self.capture_anonymous_field_constraints(node, &vertex_id, builder);
321
322        // Capture the faithful production trace: the ordered linearization
323        // of this node's children as the parser tokenized them. Each slot
324        // is either `C<kind>` for a named child (a schema edge — the
325        // variant tag the emit review consumes) or `T<text>` for an
326        // anonymous grammar token (a layout literal). This is the
327        // variant-tag fibre of the layout complement; replaying it in the
328        // put direction is byte-faithful by construction (it is the Prism
329        // review *consuming* the parser's CHOICE decision rather than
330        // re-deriving it). Hidden rules are inlined by tree-sitter and
331        // never surface as their own child, so the trace aligns 1:1 with
332        // the emitter's cursor edges and grammar string literals.
333        builder = self.capture_production_trace(node, &vertex_id, builder);
334
335        // Emit formatting constraints if enabled.
336        if self.config.capture_formatting {
337            builder = self.emit_formatting_constraints(node, &vertex_id, builder);
338        }
339
340        // Enter scope if this is a scope-introducing node. For a
341        // named scope we reuse the disambiguated leaf computed above
342        // (so the frame name matches the trailing component of
343        // `vertex_id`); for an anonymous block we push a fresh
344        // positional frame.
345        let entered_scope = if let Some(leaf) = recorded_named_leaf {
346            id_gen.push_recorded_scope(leaf);
347            true
348        } else if !is_root_wrapper && self.block_kinds.contains(kind) {
349            id_gen.push_anonymous_scope();
350            true
351        } else {
352            false
353        };
354
355        builder = self.walk_children_with_interstitials(node, builder, id_gen, &vertex_id)?;
356
357        if entered_scope {
358            id_gen.pop_scope();
359        }
360
361        Ok(builder)
362    }
363
364    /// Walk named children, capturing interstitial text between them.
365    ///
366    /// Also computes a `chose-alt-fingerprint` constraint by trimming
367    /// and joining every non-empty interstitial run. This is the
368    /// categorical discriminator for the parent vertex's CHOICE alt:
369    /// it survives the byte-position-stripping that
370    /// `emit_pretty_roundtrip`'s by-construction simulation applies,
371    /// so the CHOICE picker can dispatch deterministically against
372    /// the recorded alternative even after interstitials are removed.
373    /// A by-construction schema can populate this constraint directly
374    /// to control which alternative the emitter picks.
375    fn walk_children_with_interstitials(
376        &self,
377        node: tree_sitter::Node<'_>,
378        mut builder: SchemaBuilder,
379        id_gen: &mut IdGenerator,
380        vertex_id: &str,
381    ) -> Result<SchemaBuilder, ParseError> {
382        let cursor = &mut node.walk();
383        let children: Vec<_> = node.named_children(cursor).collect();
384        let mut interstitial_idx = 0;
385        let mut prev_end = node.start_byte();
386        let mut fingerprint_parts: Vec<String> = Vec::new();
387        let mut child_kinds: Vec<String> = Vec::new();
388
389        for child in &children {
390            let gap_start = prev_end;
391            let gap_end = child.start_byte();
392            builder = self.capture_interstitial(
393                builder,
394                vertex_id,
395                gap_start,
396                gap_end,
397                &mut interstitial_idx,
398                &mut fingerprint_parts,
399            );
400            // Record the named child's kind separately from the
401            // literal-token fingerprint. The CHOICE picker reads the
402            // kind sequence as a secondary, tiebreaker witness when
403            // literal tokens alone don't discriminate alternatives.
404            // Hidden-rule kinds (`_`-prefixed) are tree-sitter
405            // implementation detail and never authored by humans;
406            // omitting them keeps the witness language-clean and
407            // matches the convention that hidden rules are inlined
408            // by the emitter rather than dispatched against.
409            let child_kind = child.kind();
410            if !child_kind.starts_with('_') {
411                child_kinds.push(child_kind.to_owned());
412            }
413            builder = self.walk_node(*child, builder, id_gen, Some(vertex_id))?;
414            prev_end = child.end_byte();
415        }
416
417        // Trailing interstitial after the last child.
418        builder = self.capture_interstitial(
419            builder,
420            vertex_id,
421            prev_end,
422            node.end_byte(),
423            &mut interstitial_idx,
424            &mut fingerprint_parts,
425        );
426
427        if !fingerprint_parts.is_empty() {
428            builder = builder.constraint(
429                vertex_id,
430                "chose-alt-fingerprint",
431                &fingerprint_parts.join(" "),
432            );
433        }
434        if !child_kinds.is_empty() {
435            builder =
436                builder.constraint(vertex_id, "chose-alt-child-kinds", &child_kinds.join(" "));
437        }
438
439        Ok(builder)
440    }
441
442    /// Capture interstitial text between `gap_start` and `gap_end` as a constraint.
443    /// Walk all children of `node` (including anonymous tokens), and for
444    /// each anonymous-token child that was reached through a tree-sitter
445    /// `field('<name>', ...)` accessor, emit a `field:<name>` constraint
446    /// on the parent vertex carrying the token's text.
447    ///
448    /// Tree-sitter rules like `field('direction', choice('/', '\\'))` or
449    /// `field('func', choice('sigmoid','exp','log','abs'))` attach a
450    /// field name to an unnamed token alternative. The named-children
451    /// walk in [`walk_children_with_interstitials`] omits these (they
452    /// are not named nodes), and downstream consumers previously had
453    /// to recover the value by reading the source buffer between
454    /// recorded byte offsets. This emits the value as a structural
455    /// constraint so [`Schema::field_text`] can return it directly.
456    fn capture_anonymous_field_constraints(
457        &self,
458        node: tree_sitter::Node<'_>,
459        vertex_id: &str,
460        mut builder: SchemaBuilder,
461    ) -> SchemaBuilder {
462        let child_count = node.child_count();
463        for i in 0..child_count {
464            let Some(child) = node.child(u32::try_from(i).unwrap_or(0)) else {
465                continue;
466            };
467            // Named children carry their own vertex (and surface as edges
468            // keyed by the field name in walk_node). We only need to
469            // handle the unnamed tokens here.
470            if child.is_named() {
471                continue;
472            }
473            let Some(field_name) = u32::try_from(i)
474                .ok()
475                .and_then(|idx| node.field_name_for_child(idx))
476            else {
477                continue;
478            };
479            let Ok(text) = child.utf8_text(self.source) else {
480                continue;
481            };
482            let sort = format!("field:{field_name}");
483            builder = builder.constraint(vertex_id, &sort, text);
484        }
485        builder
486    }
487
488    /// Record the faithful production trace `ptrace-<slot>` for `node`.
489    ///
490    /// Walks every child of `node` (named children and anonymous tokens,
491    /// in source order) and records each slot as `C<kind>` (a named
492    /// child) or `T<text>` (an anonymous token's exact text). This is the
493    /// variant-tag fibre of the layout complement
494    /// ([`panproto_lens::layout_complement::TraceSlot`]). See the call
495    /// site for why it is byte-faithful under replay.
496    fn capture_production_trace(
497        &self,
498        node: tree_sitter::Node<'_>,
499        vertex_id: &str,
500        mut builder: SchemaBuilder,
501    ) -> SchemaBuilder {
502        let count = node.child_count();
503        let mut slot = 0usize;
504        for i in 0..count {
505            let Some(child) = u32::try_from(i).ok().and_then(|i| node.child(i)) else {
506                continue;
507            };
508            let value = if child.is_named() {
509                format!("C{}", child.kind())
510            } else if let Ok(text) = child.utf8_text(self.source) {
511                format!("T{text}")
512            } else {
513                continue;
514            };
515            builder = builder.constraint(vertex_id, &format!("ptrace-{slot}"), &value);
516            slot += 1;
517        }
518        builder
519    }
520
521    fn capture_interstitial(
522        &self,
523        mut builder: SchemaBuilder,
524        vertex_id: &str,
525        gap_start: usize,
526        gap_end: usize,
527        idx: &mut usize,
528        fingerprint: &mut Vec<String>,
529    ) -> SchemaBuilder {
530        if gap_end > gap_start && gap_end <= self.source.len() {
531            if let Ok(gap_text) = std::str::from_utf8(&self.source[gap_start..gap_end]) {
532                if !gap_text.is_empty() {
533                    let sort = format!("interstitial-{}", *idx);
534                    builder = builder.constraint(vertex_id, &sort, gap_text);
535                    builder = builder.constraint(
536                        vertex_id,
537                        &format!("{sort}-start-byte"),
538                        &gap_start.to_string(),
539                    );
540                    *idx += 1;
541                    let trimmed = gap_text.trim();
542                    if !trimmed.is_empty() {
543                        fingerprint.push(trimmed.to_owned());
544                    }
545                }
546            }
547        }
548        builder
549    }
550
551    /// Emit formatting constraints for a node (indentation, position).
552    fn emit_formatting_constraints(
553        &self,
554        node: tree_sitter::Node<'_>,
555        vertex_id: &str,
556        mut builder: SchemaBuilder,
557    ) -> SchemaBuilder {
558        let start = node.start_position();
559
560        // Capture indentation (column of first character on the line).
561        if start.column > 0 {
562            // Extract the actual indentation characters from the source.
563            let line_start = node.start_byte().saturating_sub(start.column);
564            if line_start < self.source.len() {
565                let indent_end = line_start + start.column.min(self.source.len() - line_start);
566                if let Ok(indent) = std::str::from_utf8(&self.source[line_start..indent_end]) {
567                    // Only capture if the extracted region is pure whitespace.
568                    if !indent.is_empty() && indent.trim().is_empty() {
569                        builder = builder.constraint(vertex_id, "indent", indent);
570                    }
571                }
572            }
573        }
574
575        // Count blank lines before this node by looking at source between
576        // previous sibling's end and this node's start.
577        if let Some(prev) = node.prev_named_sibling() {
578            let gap_start = prev.end_byte();
579            let gap_end = node.start_byte();
580            if gap_start < gap_end && gap_end <= self.source.len() {
581                let gap = &self.source[gap_start..gap_end];
582                let blank_lines = memchr::memchr_iter(b'\n', gap).count().saturating_sub(1);
583                if blank_lines > 0 {
584                    builder = builder.constraint(
585                        vertex_id,
586                        "blank-lines-before",
587                        &blank_lines.to_string(),
588                    );
589                }
590            }
591        }
592
593        builder
594    }
595}
596
597#[cfg(test)]
598#[allow(clippy::unwrap_used)]
599mod tests {
600    use super::*;
601
602    fn make_test_protocol() -> Protocol {
603        Protocol {
604            name: "test".into(),
605            schema_theory: "ThTest".into(),
606            instance_theory: "ThTestInst".into(),
607            schema_composition: None,
608            instance_composition: None,
609            obj_kinds: vec![], // Empty = open protocol, accepts all kinds.
610            edge_rules: vec![],
611            constraint_sorts: vec![],
612            has_order: true,
613            has_coproducts: false,
614            has_recursion: false,
615            has_causal: false,
616            nominal_identity: false,
617            has_defaults: false,
618            has_coercions: false,
619            has_mergers: false,
620            has_policies: false,
621        }
622    }
623
624    fn make_test_meta() -> ExtractedTheoryMeta {
625        use panproto_gat::{Sort, Theory};
626        ExtractedTheoryMeta {
627            theory: Theory::new("ThTest", vec![Sort::simple("Vertex")], vec![], vec![]),
628            supertypes: FxHashSet::default(),
629            subtype_map: Vec::new(),
630            optional_fields: FxHashSet::default(),
631            ordered_fields: FxHashSet::default(),
632            vertex_kinds: Vec::new(),
633            edge_kinds: Vec::new(),
634        }
635    }
636
637    /// Helper to get a grammar by name from panproto-grammars.
638    #[cfg(feature = "grammars")]
639    fn get_grammar(name: &str) -> panproto_grammars::Grammar {
640        panproto_grammars::grammars()
641            .into_iter()
642            .find(|g| g.name == name)
643            .unwrap_or_else(|| panic!("grammar '{name}' not enabled in features"))
644    }
645
646    #[test]
647    #[cfg(feature = "grammars")]
648    fn walk_simple_typescript() {
649        let source = b"function greet(name: string): string { return name; }";
650        let grammar = get_grammar("typescript");
651
652        let mut parser = tree_sitter::Parser::new();
653        parser.set_language(&grammar.language).unwrap();
654        let tree = parser.parse(source, None).unwrap();
655
656        let protocol = make_test_protocol();
657        let meta = make_test_meta();
658        let mut detector =
659            crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
660                .unwrap();
661        let walker = AstWalker::new(
662            source,
663            &meta,
664            &protocol,
665            WalkerConfig::standard(),
666            Some(&mut detector),
667        );
668
669        let schema = walker.walk(&tree, "test.ts").unwrap();
670
671        // Should have produced some vertices.
672        assert!(
673            schema.vertices.len() > 1,
674            "expected multiple vertices, got {}",
675            schema.vertices.len()
676        );
677
678        // The root should be the file.
679        let root_name: panproto_gat::Name = "test.ts".into();
680        assert!(
681            schema.vertices.contains_key(&root_name),
682            "missing root vertex"
683        );
684
685        // When tags.scm is present, the function name should appear in a vertex ID.
686        if detector.has_query() {
687            let has_greet = schema
688                .vertices
689                .keys()
690                .any(|n| n.to_string().ends_with("::greet"));
691            assert!(
692                has_greet,
693                "expected a vertex ID ending in ::greet, got: {:?}",
694                schema
695                    .vertices
696                    .keys()
697                    .map(ToString::to_string)
698                    .collect::<Vec<_>>()
699            );
700        }
701    }
702
703    #[test]
704    #[cfg(feature = "grammars")]
705    fn walk_simple_python() {
706        let source = b"def add(a, b):\n    return a + b\n";
707        let grammar = get_grammar("python");
708
709        let mut parser = tree_sitter::Parser::new();
710        parser.set_language(&grammar.language).unwrap();
711        let tree = parser.parse(source, None).unwrap();
712
713        let protocol = make_test_protocol();
714        let meta = make_test_meta();
715        let mut detector =
716            crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
717                .unwrap();
718        let walker = AstWalker::new(
719            source,
720            &meta,
721            &protocol,
722            WalkerConfig::standard(),
723            Some(&mut detector),
724        );
725
726        let schema = walker.walk(&tree, "test.py").unwrap();
727
728        assert!(
729            schema.vertices.len() > 1,
730            "expected multiple vertices, got {}",
731            schema.vertices.len()
732        );
733
734        if detector.has_query() {
735            let has_add = schema
736                .vertices
737                .keys()
738                .any(|n| n.to_string().ends_with("::add"));
739            assert!(has_add, "expected ::add vertex");
740        }
741    }
742
743    #[test]
744    #[cfg(feature = "grammars")]
745    fn walk_simple_rust() {
746        let source = b"fn verify_push() {}\nstruct Foo;\nimpl Foo { fn bar(&self) {} }\n";
747        let grammar = get_grammar("rust");
748
749        let mut parser = tree_sitter::Parser::new();
750        parser.set_language(&grammar.language).unwrap();
751        let tree = parser.parse(source, None).unwrap();
752
753        let protocol = make_test_protocol();
754        let meta = make_test_meta();
755        let mut detector =
756            crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
757                .unwrap();
758        let walker = AstWalker::new(
759            source,
760            &meta,
761            &protocol,
762            WalkerConfig::standard(),
763            Some(&mut detector),
764        );
765
766        let schema = walker.walk(&tree, "test.rs").unwrap();
767
768        assert!(
769            schema.vertices.len() > 1,
770            "expected multiple vertices, got {}",
771            schema.vertices.len()
772        );
773
774        if detector.has_query() {
775            let vertex_ids: Vec<String> = schema.vertices.keys().map(ToString::to_string).collect();
776
777            // Rust's function_item — the regression from issue #34 — must be
778            // detected as a named scope now.
779            assert!(
780                vertex_ids.iter().any(|id| id.ends_with("::verify_push")),
781                "expected ::verify_push named scope, got: {vertex_ids:?}"
782            );
783            assert!(
784                vertex_ids.iter().any(|id| id.ends_with("::Foo")),
785                "expected ::Foo named scope, got: {vertex_ids:?}"
786            );
787        }
788    }
789
790    /// Helper: parse source with a grammar, walk to Schema, emit back, compare.
791    #[cfg(feature = "group-data")]
792    fn assert_roundtrip(grammar_name: &str, source: &[u8], file_path: &str) {
793        use crate::registry::AstParser;
794        let grammar = panproto_grammars::grammars()
795            .into_iter()
796            .find(|g| g.name == grammar_name)
797            .unwrap_or_else(|| panic!("grammar '{grammar_name}' not enabled"));
798
799        let config = crate::languages::walker_configs::walker_config_for(grammar_name);
800        let lang_parser = crate::languages::common::LanguageParser::from_language(
801            grammar_name,
802            grammar.extensions.to_vec(),
803            grammar.language,
804            grammar.node_types,
805            grammar.tags_query,
806            config,
807        )
808        .unwrap();
809
810        let schema = lang_parser.parse(source, file_path).unwrap();
811        let emitted = lang_parser.emit(&schema).unwrap();
812
813        assert_eq!(
814            std::str::from_utf8(source).unwrap(),
815            std::str::from_utf8(&emitted).unwrap(),
816            "round-trip failed for {grammar_name}: emitted bytes differ from source"
817        );
818    }
819
820    #[test]
821    #[cfg(feature = "group-data")]
822    fn roundtrip_json_simple() {
823        assert_roundtrip("json", br#"{"name": "test", "value": 42}"#, "test.json");
824    }
825
826    #[test]
827    #[cfg(feature = "group-data")]
828    fn roundtrip_json_formatted() {
829        let source =
830            b"{\n  \"name\": \"test\",\n  \"value\": 42,\n  \"nested\": {\n    \"a\": true\n  }\n}";
831        assert_roundtrip("json", source, "test.json");
832    }
833
834    #[test]
835    #[cfg(feature = "group-data")]
836    fn roundtrip_json_array() {
837        let source = b"[\n  1,\n  2,\n  3\n]";
838        assert_roundtrip("json", source, "test.json");
839    }
840
841    #[test]
842    #[cfg(feature = "group-data")]
843    fn roundtrip_xml_simple() {
844        let source = b"<root>\n  <child attr=\"val\">text</child>\n</root>";
845        assert_roundtrip("xml", source, "test.xml");
846    }
847
848    #[test]
849    #[cfg(feature = "group-data")]
850    fn roundtrip_yaml_simple() {
851        let source = b"name: test\nvalue: 42\nnested:\n  a: true\n";
852        assert_roundtrip("yaml", source, "test.yaml");
853    }
854
855    #[test]
856    #[cfg(feature = "group-data")]
857    fn roundtrip_toml_simple() {
858        let source = b"[package]\nname = \"test\"\nversion = \"0.1.0\"\n";
859        assert_roundtrip("toml", source, "test.toml");
860    }
861
862    #[cfg(feature = "group-data")]
863    fn parse_with(grammar_name: &str, source: &[u8], file_path: &str) -> panproto_schema::Schema {
864        use crate::registry::AstParser;
865        let grammar = panproto_grammars::grammars()
866            .into_iter()
867            .find(|g| g.name == grammar_name)
868            .unwrap_or_else(|| panic!("grammar '{grammar_name}' not enabled"));
869        let config = crate::languages::walker_configs::walker_config_for(grammar_name);
870        let lang_parser = crate::languages::common::LanguageParser::from_language(
871            grammar_name,
872            grammar.extensions.to_vec(),
873            grammar.language,
874            grammar.node_types,
875            grammar.tags_query,
876            config,
877        )
878        .unwrap();
879        lang_parser.parse(source, file_path).unwrap()
880    }
881
882    #[test]
883    #[cfg(feature = "group-data")]
884    fn fingerprint_and_child_kinds_emitted_separately() {
885        // The walker must emit `chose-alt-fingerprint` and
886        // `chose-alt-child-kinds` as TWO distinct constraints. The
887        // CHOICE picker reads them independently: literal-token
888        // matches drive primary scoring, child-kind matches act as a
889        // tiebreaker. Mixing them would let punctuation in kind names
890        // contaminate the literal score.
891        let schema = parse_with("json", br#"{"a": 1}"#, "test.json");
892
893        let saw_fingerprint = schema.constraints.values().any(|cs| {
894            cs.iter()
895                .any(|c| c.sort.as_ref() == "chose-alt-fingerprint")
896        });
897        let saw_child_kinds = schema.constraints.values().any(|cs| {
898            cs.iter()
899                .any(|c| c.sort.as_ref() == "chose-alt-child-kinds")
900        });
901
902        assert!(
903            saw_fingerprint,
904            "walker must emit chose-alt-fingerprint (literal-token witness)"
905        );
906        assert!(
907            saw_child_kinds,
908            "walker must emit chose-alt-child-kinds (named-kind witness)"
909        );
910    }
911
912    #[test]
913    #[cfg(feature = "group-data")]
914    fn child_kinds_excludes_hidden_rules() {
915        // Hidden rules (`_`-prefixed) are tree-sitter implementation
916        // detail and must not appear in the kind witness.
917        let schema = parse_with("json", br#"{"k": "v"}"#, "test.json");
918
919        for cs in schema.constraints.values() {
920            for c in cs {
921                if c.sort.as_ref() == "chose-alt-child-kinds" {
922                    for kind in c.value.split_whitespace() {
923                        assert!(
924                            !kind.starts_with('_'),
925                            "hidden-rule kind '{kind}' must not appear in chose-alt-child-kinds"
926                        );
927                    }
928                }
929            }
930        }
931    }
932}