1use std::collections::BTreeMap;
17
18use panproto_schema::{Protocol, Schema, SchemaBuilder};
19use rustc_hash::FxHashSet;
20
21use crate::error::ParseError;
22use crate::id_scheme::IdGenerator;
23use crate::scope_detector::{NamedScope, ScopeDetector};
24use crate::theory_extract::ExtractedTheoryMeta;
25
26const BLOCK_KINDS: &[&str] = &[
33 "block",
34 "statement_block",
35 "compound_statement",
36 "declaration_list",
37 "field_declaration_list",
38 "enum_body",
39 "class_body",
40 "interface_body",
41 "module_body",
42];
43
44#[derive(Debug, Clone, Default)]
46pub struct WalkerConfig {
47 pub extra_block_kinds: Vec<String>,
55 pub capture_comments: bool,
57 pub capture_formatting: bool,
59}
60
61impl WalkerConfig {
62 #[must_use]
65 pub const fn standard() -> Self {
66 Self {
67 extra_block_kinds: Vec::new(),
68 capture_comments: true,
69 capture_formatting: true,
70 }
71 }
72}
73
74pub struct AstWalker<'a> {
83 source: &'a [u8],
85 theory_meta: &'a ExtractedTheoryMeta,
89 protocol: &'a Protocol,
91 config: WalkerConfig,
93 block_kinds: FxHashSet<String>,
95 scope_map: BTreeMap<(usize, usize), NamedScope>,
99}
100
101impl<'a> AstWalker<'a> {
102 #[must_use]
113 pub fn new(
114 source: &'a [u8],
115 theory_meta: &'a ExtractedTheoryMeta,
116 protocol: &'a Protocol,
117 config: WalkerConfig,
118 scope_detector: Option<&mut ScopeDetector>,
119 ) -> Self {
120 let mut block_kinds: FxHashSet<String> =
121 BLOCK_KINDS.iter().map(|s| (*s).to_owned()).collect();
122 for kind in &config.extra_block_kinds {
123 block_kinds.insert(kind.clone());
124 }
125
126 let mut scope_map: BTreeMap<(usize, usize), NamedScope> = BTreeMap::new();
127 if let Some(det) = scope_detector {
128 for scope in det.scopes(source) {
129 scope_map.insert((scope.node_range.start, scope.node_range.end), scope);
130 }
131 }
132
133 Self {
134 source,
135 theory_meta,
136 protocol,
137 config,
138 block_kinds,
139 scope_map,
140 }
141 }
142
143 pub fn walk(&self, tree: &tree_sitter::Tree, file_path: &str) -> Result<Schema, ParseError> {
149 let mut id_gen = IdGenerator::new(file_path);
150 let builder = SchemaBuilder::new(self.protocol);
151 let root = tree.root_node();
152
153 let builder = self.walk_node(root, builder, &mut id_gen, None)?;
154
155 builder.build().map_err(|e| ParseError::SchemaConstruction {
156 reason: e.to_string(),
157 })
158 }
159
160 fn scope_for(&self, node: tree_sitter::Node<'_>) -> Option<&NamedScope> {
162 self.scope_map.get(&(node.start_byte(), node.end_byte()))
163 }
164
165 fn walk_node(
167 &self,
168 node: tree_sitter::Node<'_>,
169 mut builder: SchemaBuilder,
170 id_gen: &mut IdGenerator,
171 parent_vertex_id: Option<&str>,
172 ) -> Result<SchemaBuilder, ParseError> {
173 if !node.is_named() {
175 return Ok(builder);
176 }
177
178 let kind = node.kind();
179
180 let is_root_wrapper = parent_vertex_id.is_none()
183 && (kind == "program"
184 || kind == "source_file"
185 || kind == "module"
186 || kind == "translation_unit");
187
188 let named_scope = if is_root_wrapper {
189 None
190 } else {
191 self.scope_for(node)
192 };
193
194 let (vertex_id, recorded_named_leaf) = if is_root_wrapper {
208 (id_gen.current_prefix(), None)
210 } else if let Some(scope) = named_scope {
211 let leaf = id_gen.record_name(&scope.name);
212 let prefix = id_gen.current_prefix();
213 (format!("{prefix}::{leaf}"), Some(leaf))
214 } else {
215 (id_gen.anonymous_id(), None)
217 };
218
219 let effective_kind = if self.protocol.obj_kinds.is_empty() {
223 kind
225 } else if self.protocol.obj_kinds.iter().any(|k| k == kind) {
226 kind
227 } else if !self.theory_meta.vertex_kinds.is_empty()
228 && self.theory_meta.vertex_kinds.iter().any(|k| k == kind)
229 {
230 kind
232 } else {
233 "node"
234 };
235
236 builder = builder
237 .vertex(&vertex_id, effective_kind, None)
238 .map_err(|e| ParseError::SchemaConstruction {
239 reason: format!("vertex '{vertex_id}' ({kind}): {e}"),
240 })?;
241
242 if let Some(parent_id) = parent_vertex_id {
244 let edge_kind = node
247 .parent()
248 .and_then(|p| {
249 for i in 0..p.child_count() {
251 if let Some(child) = p.child(u32::try_from(i).unwrap_or(0)) {
252 if child.id() == node.id() {
253 return u32::try_from(i)
254 .ok()
255 .and_then(|idx| p.field_name_for_child(idx));
256 }
257 }
258 }
259 None
260 })
261 .unwrap_or("child_of");
262
263 builder = builder
264 .edge(parent_id, &vertex_id, edge_kind, None)
265 .map_err(|e| ParseError::SchemaConstruction {
266 reason: format!("edge {parent_id} -> {vertex_id} ({edge_kind}): {e}"),
267 })?;
268 }
269
270 builder = builder.constraint(&vertex_id, "start-byte", &node.start_byte().to_string());
272 builder = builder.constraint(&vertex_id, "end-byte", &node.end_byte().to_string());
273
274 if parent_vertex_id.is_none() && node.start_byte() > 0 {
283 if let Ok(prefix) = std::str::from_utf8(&self.source[..node.start_byte()]) {
284 if !prefix.is_empty() {
285 builder = builder.constraint(&vertex_id, "doc-prefix", prefix);
286 }
287 }
288 }
289
290 let grammar_name = node.grammar_name();
300 if grammar_name != kind {
301 builder = builder.constraint(&vertex_id, "pre-alias-symbol", grammar_name);
302 }
303
304 if node.named_child_count() == 0 {
306 if let Ok(text) = node.utf8_text(self.source) {
307 builder = builder.constraint(&vertex_id, "literal-value", text);
308 }
309 }
310
311 builder = self.capture_anonymous_field_constraints(node, &vertex_id, builder);
321
322 builder = self.capture_production_trace(node, &vertex_id, builder);
334
335 if self.config.capture_formatting {
337 builder = self.emit_formatting_constraints(node, &vertex_id, builder);
338 }
339
340 let entered_scope = if let Some(leaf) = recorded_named_leaf {
346 id_gen.push_recorded_scope(leaf);
347 true
348 } else if !is_root_wrapper && self.block_kinds.contains(kind) {
349 id_gen.push_anonymous_scope();
350 true
351 } else {
352 false
353 };
354
355 builder = self.walk_children_with_interstitials(node, builder, id_gen, &vertex_id)?;
356
357 if entered_scope {
358 id_gen.pop_scope();
359 }
360
361 Ok(builder)
362 }
363
364 fn walk_children_with_interstitials(
376 &self,
377 node: tree_sitter::Node<'_>,
378 mut builder: SchemaBuilder,
379 id_gen: &mut IdGenerator,
380 vertex_id: &str,
381 ) -> Result<SchemaBuilder, ParseError> {
382 let cursor = &mut node.walk();
383 let children: Vec<_> = node.named_children(cursor).collect();
384 let mut interstitial_idx = 0;
385 let mut prev_end = node.start_byte();
386 let mut fingerprint_parts: Vec<String> = Vec::new();
387 let mut child_kinds: Vec<String> = Vec::new();
388
389 for child in &children {
390 let gap_start = prev_end;
391 let gap_end = child.start_byte();
392 builder = self.capture_interstitial(
393 builder,
394 vertex_id,
395 gap_start,
396 gap_end,
397 &mut interstitial_idx,
398 &mut fingerprint_parts,
399 );
400 let child_kind = child.kind();
410 if !child_kind.starts_with('_') {
411 child_kinds.push(child_kind.to_owned());
412 }
413 builder = self.walk_node(*child, builder, id_gen, Some(vertex_id))?;
414 prev_end = child.end_byte();
415 }
416
417 builder = self.capture_interstitial(
419 builder,
420 vertex_id,
421 prev_end,
422 node.end_byte(),
423 &mut interstitial_idx,
424 &mut fingerprint_parts,
425 );
426
427 if !fingerprint_parts.is_empty() {
428 builder = builder.constraint(
429 vertex_id,
430 "chose-alt-fingerprint",
431 &fingerprint_parts.join(" "),
432 );
433 }
434 if !child_kinds.is_empty() {
435 builder =
436 builder.constraint(vertex_id, "chose-alt-child-kinds", &child_kinds.join(" "));
437 }
438
439 Ok(builder)
440 }
441
442 fn capture_anonymous_field_constraints(
457 &self,
458 node: tree_sitter::Node<'_>,
459 vertex_id: &str,
460 mut builder: SchemaBuilder,
461 ) -> SchemaBuilder {
462 let child_count = node.child_count();
463 for i in 0..child_count {
464 let Some(child) = node.child(u32::try_from(i).unwrap_or(0)) else {
465 continue;
466 };
467 if child.is_named() {
471 continue;
472 }
473 let Some(field_name) = u32::try_from(i)
474 .ok()
475 .and_then(|idx| node.field_name_for_child(idx))
476 else {
477 continue;
478 };
479 let Ok(text) = child.utf8_text(self.source) else {
480 continue;
481 };
482 let sort = format!("field:{field_name}");
483 builder = builder.constraint(vertex_id, &sort, text);
484 }
485 builder
486 }
487
488 fn capture_production_trace(
497 &self,
498 node: tree_sitter::Node<'_>,
499 vertex_id: &str,
500 mut builder: SchemaBuilder,
501 ) -> SchemaBuilder {
502 let count = node.child_count();
503 let mut slot = 0usize;
504 for i in 0..count {
505 let Some(child) = u32::try_from(i).ok().and_then(|i| node.child(i)) else {
506 continue;
507 };
508 let value = if child.is_named() {
509 format!("C{}", child.kind())
510 } else if let Ok(text) = child.utf8_text(self.source) {
511 format!("T{text}")
512 } else {
513 continue;
514 };
515 builder = builder.constraint(vertex_id, &format!("ptrace-{slot}"), &value);
516 slot += 1;
517 }
518 builder
519 }
520
521 fn capture_interstitial(
522 &self,
523 mut builder: SchemaBuilder,
524 vertex_id: &str,
525 gap_start: usize,
526 gap_end: usize,
527 idx: &mut usize,
528 fingerprint: &mut Vec<String>,
529 ) -> SchemaBuilder {
530 if gap_end > gap_start && gap_end <= self.source.len() {
531 if let Ok(gap_text) = std::str::from_utf8(&self.source[gap_start..gap_end]) {
532 if !gap_text.is_empty() {
533 let sort = format!("interstitial-{}", *idx);
534 builder = builder.constraint(vertex_id, &sort, gap_text);
535 builder = builder.constraint(
536 vertex_id,
537 &format!("{sort}-start-byte"),
538 &gap_start.to_string(),
539 );
540 *idx += 1;
541 let trimmed = gap_text.trim();
542 if !trimmed.is_empty() {
543 fingerprint.push(trimmed.to_owned());
544 }
545 }
546 }
547 }
548 builder
549 }
550
551 fn emit_formatting_constraints(
553 &self,
554 node: tree_sitter::Node<'_>,
555 vertex_id: &str,
556 mut builder: SchemaBuilder,
557 ) -> SchemaBuilder {
558 let start = node.start_position();
559
560 if start.column > 0 {
562 let line_start = node.start_byte().saturating_sub(start.column);
564 if line_start < self.source.len() {
565 let indent_end = line_start + start.column.min(self.source.len() - line_start);
566 if let Ok(indent) = std::str::from_utf8(&self.source[line_start..indent_end]) {
567 if !indent.is_empty() && indent.trim().is_empty() {
569 builder = builder.constraint(vertex_id, "indent", indent);
570 }
571 }
572 }
573 }
574
575 if let Some(prev) = node.prev_named_sibling() {
578 let gap_start = prev.end_byte();
579 let gap_end = node.start_byte();
580 if gap_start < gap_end && gap_end <= self.source.len() {
581 let gap = &self.source[gap_start..gap_end];
582 let blank_lines = memchr::memchr_iter(b'\n', gap).count().saturating_sub(1);
583 if blank_lines > 0 {
584 builder = builder.constraint(
585 vertex_id,
586 "blank-lines-before",
587 &blank_lines.to_string(),
588 );
589 }
590 }
591 }
592
593 builder
594 }
595}
596
597#[cfg(test)]
598#[allow(clippy::unwrap_used)]
599mod tests {
600 use super::*;
601
602 fn make_test_protocol() -> Protocol {
603 Protocol {
604 name: "test".into(),
605 schema_theory: "ThTest".into(),
606 instance_theory: "ThTestInst".into(),
607 schema_composition: None,
608 instance_composition: None,
609 obj_kinds: vec![], edge_rules: vec![],
611 constraint_sorts: vec![],
612 has_order: true,
613 has_coproducts: false,
614 has_recursion: false,
615 has_causal: false,
616 nominal_identity: false,
617 has_defaults: false,
618 has_coercions: false,
619 has_mergers: false,
620 has_policies: false,
621 }
622 }
623
624 fn make_test_meta() -> ExtractedTheoryMeta {
625 use panproto_gat::{Sort, Theory};
626 ExtractedTheoryMeta {
627 theory: Theory::new("ThTest", vec![Sort::simple("Vertex")], vec![], vec![]),
628 supertypes: FxHashSet::default(),
629 subtype_map: Vec::new(),
630 optional_fields: FxHashSet::default(),
631 ordered_fields: FxHashSet::default(),
632 vertex_kinds: Vec::new(),
633 edge_kinds: Vec::new(),
634 }
635 }
636
637 #[cfg(feature = "grammars")]
639 fn get_grammar(name: &str) -> panproto_grammars::Grammar {
640 panproto_grammars::grammars()
641 .into_iter()
642 .find(|g| g.name == name)
643 .unwrap_or_else(|| panic!("grammar '{name}' not enabled in features"))
644 }
645
646 #[test]
647 #[cfg(feature = "grammars")]
648 fn walk_simple_typescript() {
649 let source = b"function greet(name: string): string { return name; }";
650 let grammar = get_grammar("typescript");
651
652 let mut parser = tree_sitter::Parser::new();
653 parser.set_language(&grammar.language).unwrap();
654 let tree = parser.parse(source, None).unwrap();
655
656 let protocol = make_test_protocol();
657 let meta = make_test_meta();
658 let mut detector =
659 crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
660 .unwrap();
661 let walker = AstWalker::new(
662 source,
663 &meta,
664 &protocol,
665 WalkerConfig::standard(),
666 Some(&mut detector),
667 );
668
669 let schema = walker.walk(&tree, "test.ts").unwrap();
670
671 assert!(
673 schema.vertices.len() > 1,
674 "expected multiple vertices, got {}",
675 schema.vertices.len()
676 );
677
678 let root_name: panproto_gat::Name = "test.ts".into();
680 assert!(
681 schema.vertices.contains_key(&root_name),
682 "missing root vertex"
683 );
684
685 if detector.has_query() {
687 let has_greet = schema
688 .vertices
689 .keys()
690 .any(|n| n.to_string().ends_with("::greet"));
691 assert!(
692 has_greet,
693 "expected a vertex ID ending in ::greet, got: {:?}",
694 schema
695 .vertices
696 .keys()
697 .map(ToString::to_string)
698 .collect::<Vec<_>>()
699 );
700 }
701 }
702
703 #[test]
704 #[cfg(feature = "grammars")]
705 fn walk_simple_python() {
706 let source = b"def add(a, b):\n return a + b\n";
707 let grammar = get_grammar("python");
708
709 let mut parser = tree_sitter::Parser::new();
710 parser.set_language(&grammar.language).unwrap();
711 let tree = parser.parse(source, None).unwrap();
712
713 let protocol = make_test_protocol();
714 let meta = make_test_meta();
715 let mut detector =
716 crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
717 .unwrap();
718 let walker = AstWalker::new(
719 source,
720 &meta,
721 &protocol,
722 WalkerConfig::standard(),
723 Some(&mut detector),
724 );
725
726 let schema = walker.walk(&tree, "test.py").unwrap();
727
728 assert!(
729 schema.vertices.len() > 1,
730 "expected multiple vertices, got {}",
731 schema.vertices.len()
732 );
733
734 if detector.has_query() {
735 let has_add = schema
736 .vertices
737 .keys()
738 .any(|n| n.to_string().ends_with("::add"));
739 assert!(has_add, "expected ::add vertex");
740 }
741 }
742
743 #[test]
744 #[cfg(feature = "grammars")]
745 fn walk_simple_rust() {
746 let source = b"fn verify_push() {}\nstruct Foo;\nimpl Foo { fn bar(&self) {} }\n";
747 let grammar = get_grammar("rust");
748
749 let mut parser = tree_sitter::Parser::new();
750 parser.set_language(&grammar.language).unwrap();
751 let tree = parser.parse(source, None).unwrap();
752
753 let protocol = make_test_protocol();
754 let meta = make_test_meta();
755 let mut detector =
756 crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
757 .unwrap();
758 let walker = AstWalker::new(
759 source,
760 &meta,
761 &protocol,
762 WalkerConfig::standard(),
763 Some(&mut detector),
764 );
765
766 let schema = walker.walk(&tree, "test.rs").unwrap();
767
768 assert!(
769 schema.vertices.len() > 1,
770 "expected multiple vertices, got {}",
771 schema.vertices.len()
772 );
773
774 if detector.has_query() {
775 let vertex_ids: Vec<String> = schema.vertices.keys().map(ToString::to_string).collect();
776
777 assert!(
780 vertex_ids.iter().any(|id| id.ends_with("::verify_push")),
781 "expected ::verify_push named scope, got: {vertex_ids:?}"
782 );
783 assert!(
784 vertex_ids.iter().any(|id| id.ends_with("::Foo")),
785 "expected ::Foo named scope, got: {vertex_ids:?}"
786 );
787 }
788 }
789
790 #[cfg(feature = "group-data")]
792 fn assert_roundtrip(grammar_name: &str, source: &[u8], file_path: &str) {
793 use crate::registry::AstParser;
794 let grammar = panproto_grammars::grammars()
795 .into_iter()
796 .find(|g| g.name == grammar_name)
797 .unwrap_or_else(|| panic!("grammar '{grammar_name}' not enabled"));
798
799 let config = crate::languages::walker_configs::walker_config_for(grammar_name);
800 let lang_parser = crate::languages::common::LanguageParser::from_language(
801 grammar_name,
802 grammar.extensions.to_vec(),
803 grammar.language,
804 grammar.node_types,
805 grammar.tags_query,
806 config,
807 )
808 .unwrap();
809
810 let schema = lang_parser.parse(source, file_path).unwrap();
811 let emitted = lang_parser.emit(&schema).unwrap();
812
813 assert_eq!(
814 std::str::from_utf8(source).unwrap(),
815 std::str::from_utf8(&emitted).unwrap(),
816 "round-trip failed for {grammar_name}: emitted bytes differ from source"
817 );
818 }
819
820 #[test]
821 #[cfg(feature = "group-data")]
822 fn roundtrip_json_simple() {
823 assert_roundtrip("json", br#"{"name": "test", "value": 42}"#, "test.json");
824 }
825
826 #[test]
827 #[cfg(feature = "group-data")]
828 fn roundtrip_json_formatted() {
829 let source =
830 b"{\n \"name\": \"test\",\n \"value\": 42,\n \"nested\": {\n \"a\": true\n }\n}";
831 assert_roundtrip("json", source, "test.json");
832 }
833
834 #[test]
835 #[cfg(feature = "group-data")]
836 fn roundtrip_json_array() {
837 let source = b"[\n 1,\n 2,\n 3\n]";
838 assert_roundtrip("json", source, "test.json");
839 }
840
841 #[test]
842 #[cfg(feature = "group-data")]
843 fn roundtrip_xml_simple() {
844 let source = b"<root>\n <child attr=\"val\">text</child>\n</root>";
845 assert_roundtrip("xml", source, "test.xml");
846 }
847
848 #[test]
849 #[cfg(feature = "group-data")]
850 fn roundtrip_yaml_simple() {
851 let source = b"name: test\nvalue: 42\nnested:\n a: true\n";
852 assert_roundtrip("yaml", source, "test.yaml");
853 }
854
855 #[test]
856 #[cfg(feature = "group-data")]
857 fn roundtrip_toml_simple() {
858 let source = b"[package]\nname = \"test\"\nversion = \"0.1.0\"\n";
859 assert_roundtrip("toml", source, "test.toml");
860 }
861
862 #[cfg(feature = "group-data")]
863 fn parse_with(grammar_name: &str, source: &[u8], file_path: &str) -> panproto_schema::Schema {
864 use crate::registry::AstParser;
865 let grammar = panproto_grammars::grammars()
866 .into_iter()
867 .find(|g| g.name == grammar_name)
868 .unwrap_or_else(|| panic!("grammar '{grammar_name}' not enabled"));
869 let config = crate::languages::walker_configs::walker_config_for(grammar_name);
870 let lang_parser = crate::languages::common::LanguageParser::from_language(
871 grammar_name,
872 grammar.extensions.to_vec(),
873 grammar.language,
874 grammar.node_types,
875 grammar.tags_query,
876 config,
877 )
878 .unwrap();
879 lang_parser.parse(source, file_path).unwrap()
880 }
881
882 #[test]
883 #[cfg(feature = "group-data")]
884 fn fingerprint_and_child_kinds_emitted_separately() {
885 let schema = parse_with("json", br#"{"a": 1}"#, "test.json");
892
893 let saw_fingerprint = schema.constraints.values().any(|cs| {
894 cs.iter()
895 .any(|c| c.sort.as_ref() == "chose-alt-fingerprint")
896 });
897 let saw_child_kinds = schema.constraints.values().any(|cs| {
898 cs.iter()
899 .any(|c| c.sort.as_ref() == "chose-alt-child-kinds")
900 });
901
902 assert!(
903 saw_fingerprint,
904 "walker must emit chose-alt-fingerprint (literal-token witness)"
905 );
906 assert!(
907 saw_child_kinds,
908 "walker must emit chose-alt-child-kinds (named-kind witness)"
909 );
910 }
911
912 #[test]
913 #[cfg(feature = "group-data")]
914 fn child_kinds_excludes_hidden_rules() {
915 let schema = parse_with("json", br#"{"k": "v"}"#, "test.json");
918
919 for cs in schema.constraints.values() {
920 for c in cs {
921 if c.sort.as_ref() == "chose-alt-child-kinds" {
922 for kind in c.value.split_whitespace() {
923 assert!(
924 !kind.starts_with('_'),
925 "hidden-rule kind '{kind}' must not appear in chose-alt-child-kinds"
926 );
927 }
928 }
929 }
930 }
931 }
932}