1use std::collections::BTreeMap;
17
18use panproto_schema::{Protocol, Schema, SchemaBuilder};
19use rustc_hash::FxHashSet;
20
21use crate::error::ParseError;
22use crate::id_scheme::IdGenerator;
23use crate::scope_detector::{NamedScope, ScopeDetector};
24use crate::theory_extract::ExtractedTheoryMeta;
25
26const BLOCK_KINDS: &[&str] = &[
33 "block",
34 "statement_block",
35 "compound_statement",
36 "declaration_list",
37 "field_declaration_list",
38 "enum_body",
39 "class_body",
40 "interface_body",
41 "module_body",
42];
43
44#[derive(Debug, Clone, Default)]
46pub struct WalkerConfig {
47 pub extra_block_kinds: Vec<String>,
55 pub capture_comments: bool,
57 pub capture_formatting: bool,
59}
60
61impl WalkerConfig {
62 #[must_use]
65 pub const fn standard() -> Self {
66 Self {
67 extra_block_kinds: Vec::new(),
68 capture_comments: true,
69 capture_formatting: true,
70 }
71 }
72}
73
74pub struct AstWalker<'a> {
83 source: &'a [u8],
85 theory_meta: &'a ExtractedTheoryMeta,
89 protocol: &'a Protocol,
91 config: WalkerConfig,
93 block_kinds: FxHashSet<String>,
95 scope_map: BTreeMap<(usize, usize), NamedScope>,
99}
100
101impl<'a> AstWalker<'a> {
102 #[must_use]
113 pub fn new(
114 source: &'a [u8],
115 theory_meta: &'a ExtractedTheoryMeta,
116 protocol: &'a Protocol,
117 config: WalkerConfig,
118 scope_detector: Option<&mut ScopeDetector>,
119 ) -> Self {
120 let mut block_kinds: FxHashSet<String> =
121 BLOCK_KINDS.iter().map(|s| (*s).to_owned()).collect();
122 for kind in &config.extra_block_kinds {
123 block_kinds.insert(kind.clone());
124 }
125
126 let mut scope_map: BTreeMap<(usize, usize), NamedScope> = BTreeMap::new();
127 if let Some(det) = scope_detector {
128 for scope in det.scopes(source) {
129 scope_map.insert((scope.node_range.start, scope.node_range.end), scope);
130 }
131 }
132
133 Self {
134 source,
135 theory_meta,
136 protocol,
137 config,
138 block_kinds,
139 scope_map,
140 }
141 }
142
143 pub fn walk(&self, tree: &tree_sitter::Tree, file_path: &str) -> Result<Schema, ParseError> {
149 let mut id_gen = IdGenerator::new(file_path);
150 let builder = SchemaBuilder::new(self.protocol);
151 let root = tree.root_node();
152
153 let builder = self.walk_node(root, builder, &mut id_gen, None)?;
154
155 builder.build().map_err(|e| ParseError::SchemaConstruction {
156 reason: e.to_string(),
157 })
158 }
159
160 fn scope_for(&self, node: tree_sitter::Node<'_>) -> Option<&NamedScope> {
162 self.scope_map.get(&(node.start_byte(), node.end_byte()))
163 }
164
165 fn walk_node(
167 &self,
168 node: tree_sitter::Node<'_>,
169 mut builder: SchemaBuilder,
170 id_gen: &mut IdGenerator,
171 parent_vertex_id: Option<&str>,
172 ) -> Result<SchemaBuilder, ParseError> {
173 if !node.is_named() {
175 return Ok(builder);
176 }
177
178 let kind = node.kind();
179
180 let is_root_wrapper = parent_vertex_id.is_none()
183 && (kind == "program"
184 || kind == "source_file"
185 || kind == "module"
186 || kind == "translation_unit");
187
188 let named_scope = if is_root_wrapper {
189 None
190 } else {
191 self.scope_for(node)
192 };
193
194 let vertex_id = if is_root_wrapper {
196 id_gen.current_prefix()
198 } else if let Some(scope) = named_scope {
199 id_gen.named_id(&scope.name)
200 } else {
201 id_gen.anonymous_id()
203 };
204
205 let effective_kind = if self.protocol.obj_kinds.is_empty() {
209 kind
211 } else if self.protocol.obj_kinds.iter().any(|k| k == kind) {
212 kind
213 } else if !self.theory_meta.vertex_kinds.is_empty()
214 && self.theory_meta.vertex_kinds.iter().any(|k| k == kind)
215 {
216 kind
218 } else {
219 "node"
220 };
221
222 builder = builder
223 .vertex(&vertex_id, effective_kind, None)
224 .map_err(|e| ParseError::SchemaConstruction {
225 reason: format!("vertex '{vertex_id}' ({kind}): {e}"),
226 })?;
227
228 if let Some(parent_id) = parent_vertex_id {
230 let edge_kind = node
233 .parent()
234 .and_then(|p| {
235 for i in 0..p.child_count() {
237 if let Some(child) = p.child(i) {
238 if child.id() == node.id() {
239 return u32::try_from(i)
240 .ok()
241 .and_then(|idx| p.field_name_for_child(idx));
242 }
243 }
244 }
245 None
246 })
247 .unwrap_or("child_of");
248
249 builder = builder
250 .edge(parent_id, &vertex_id, edge_kind, None)
251 .map_err(|e| ParseError::SchemaConstruction {
252 reason: format!("edge {parent_id} -> {vertex_id} ({edge_kind}): {e}"),
253 })?;
254 }
255
256 builder = builder.constraint(&vertex_id, "start-byte", &node.start_byte().to_string());
258 builder = builder.constraint(&vertex_id, "end-byte", &node.end_byte().to_string());
259
260 if node.named_child_count() == 0 {
262 if let Ok(text) = node.utf8_text(self.source) {
263 builder = builder.constraint(&vertex_id, "literal-value", text);
264 }
265 }
266
267 builder = self.capture_anonymous_field_constraints(node, &vertex_id, builder);
277
278 if self.config.capture_formatting {
280 builder = self.emit_formatting_constraints(node, &vertex_id, builder);
281 }
282
283 let entered_scope = if let Some(scope) = named_scope {
285 id_gen.push_named_scope(&scope.name);
286 true
287 } else if !is_root_wrapper && self.block_kinds.contains(kind) {
288 id_gen.push_anonymous_scope();
289 true
290 } else {
291 false
292 };
293
294 builder = self.walk_children_with_interstitials(node, builder, id_gen, &vertex_id)?;
295
296 if entered_scope {
297 id_gen.pop_scope();
298 }
299
300 Ok(builder)
301 }
302
303 fn walk_children_with_interstitials(
315 &self,
316 node: tree_sitter::Node<'_>,
317 mut builder: SchemaBuilder,
318 id_gen: &mut IdGenerator,
319 vertex_id: &str,
320 ) -> Result<SchemaBuilder, ParseError> {
321 let cursor = &mut node.walk();
322 let children: Vec<_> = node.named_children(cursor).collect();
323 let mut interstitial_idx = 0;
324 let mut prev_end = node.start_byte();
325 let mut fingerprint_parts: Vec<String> = Vec::new();
326 let mut child_kinds: Vec<String> = Vec::new();
327
328 for child in &children {
329 let gap_start = prev_end;
330 let gap_end = child.start_byte();
331 builder = self.capture_interstitial(
332 builder,
333 vertex_id,
334 gap_start,
335 gap_end,
336 &mut interstitial_idx,
337 &mut fingerprint_parts,
338 );
339 let child_kind = child.kind();
349 if !child_kind.starts_with('_') {
350 child_kinds.push(child_kind.to_owned());
351 }
352 builder = self.walk_node(*child, builder, id_gen, Some(vertex_id))?;
353 prev_end = child.end_byte();
354 }
355
356 builder = self.capture_interstitial(
358 builder,
359 vertex_id,
360 prev_end,
361 node.end_byte(),
362 &mut interstitial_idx,
363 &mut fingerprint_parts,
364 );
365
366 if !fingerprint_parts.is_empty() {
367 builder = builder.constraint(
368 vertex_id,
369 "chose-alt-fingerprint",
370 &fingerprint_parts.join(" "),
371 );
372 }
373 if !child_kinds.is_empty() {
374 builder =
375 builder.constraint(vertex_id, "chose-alt-child-kinds", &child_kinds.join(" "));
376 }
377
378 Ok(builder)
379 }
380
381 fn capture_anonymous_field_constraints(
396 &self,
397 node: tree_sitter::Node<'_>,
398 vertex_id: &str,
399 mut builder: SchemaBuilder,
400 ) -> SchemaBuilder {
401 let child_count = node.child_count();
402 for i in 0..child_count {
403 let Some(child) = node.child(i) else { continue };
404 if child.is_named() {
408 continue;
409 }
410 let Some(field_name) = u32::try_from(i)
411 .ok()
412 .and_then(|idx| node.field_name_for_child(idx))
413 else {
414 continue;
415 };
416 let Ok(text) = child.utf8_text(self.source) else {
417 continue;
418 };
419 let sort = format!("field:{field_name}");
420 builder = builder.constraint(vertex_id, &sort, text);
421 }
422 builder
423 }
424
425 fn capture_interstitial(
426 &self,
427 mut builder: SchemaBuilder,
428 vertex_id: &str,
429 gap_start: usize,
430 gap_end: usize,
431 idx: &mut usize,
432 fingerprint: &mut Vec<String>,
433 ) -> SchemaBuilder {
434 if gap_end > gap_start && gap_end <= self.source.len() {
435 if let Ok(gap_text) = std::str::from_utf8(&self.source[gap_start..gap_end]) {
436 if !gap_text.is_empty() {
437 let sort = format!("interstitial-{}", *idx);
438 builder = builder.constraint(vertex_id, &sort, gap_text);
439 builder = builder.constraint(
440 vertex_id,
441 &format!("{sort}-start-byte"),
442 &gap_start.to_string(),
443 );
444 *idx += 1;
445 let trimmed = gap_text.trim();
446 if !trimmed.is_empty() {
447 fingerprint.push(trimmed.to_owned());
448 }
449 }
450 }
451 }
452 builder
453 }
454
455 fn emit_formatting_constraints(
457 &self,
458 node: tree_sitter::Node<'_>,
459 vertex_id: &str,
460 mut builder: SchemaBuilder,
461 ) -> SchemaBuilder {
462 let start = node.start_position();
463
464 if start.column > 0 {
466 let line_start = node.start_byte().saturating_sub(start.column);
468 if line_start < self.source.len() {
469 let indent_end = line_start + start.column.min(self.source.len() - line_start);
470 if let Ok(indent) = std::str::from_utf8(&self.source[line_start..indent_end]) {
471 if !indent.is_empty() && indent.trim().is_empty() {
473 builder = builder.constraint(vertex_id, "indent", indent);
474 }
475 }
476 }
477 }
478
479 if let Some(prev) = node.prev_named_sibling() {
482 let gap_start = prev.end_byte();
483 let gap_end = node.start_byte();
484 if gap_start < gap_end && gap_end <= self.source.len() {
485 let gap = &self.source[gap_start..gap_end];
486 let blank_lines = memchr::memchr_iter(b'\n', gap).count().saturating_sub(1);
487 if blank_lines > 0 {
488 builder = builder.constraint(
489 vertex_id,
490 "blank-lines-before",
491 &blank_lines.to_string(),
492 );
493 }
494 }
495 }
496
497 builder
498 }
499}
500
501#[cfg(test)]
502#[allow(clippy::unwrap_used)]
503mod tests {
504 use super::*;
505
506 fn make_test_protocol() -> Protocol {
507 Protocol {
508 name: "test".into(),
509 schema_theory: "ThTest".into(),
510 instance_theory: "ThTestInst".into(),
511 schema_composition: None,
512 instance_composition: None,
513 obj_kinds: vec![], edge_rules: vec![],
515 constraint_sorts: vec![],
516 has_order: true,
517 has_coproducts: false,
518 has_recursion: false,
519 has_causal: false,
520 nominal_identity: false,
521 has_defaults: false,
522 has_coercions: false,
523 has_mergers: false,
524 has_policies: false,
525 }
526 }
527
528 fn make_test_meta() -> ExtractedTheoryMeta {
529 use panproto_gat::{Sort, Theory};
530 ExtractedTheoryMeta {
531 theory: Theory::new("ThTest", vec![Sort::simple("Vertex")], vec![], vec![]),
532 supertypes: FxHashSet::default(),
533 subtype_map: Vec::new(),
534 optional_fields: FxHashSet::default(),
535 ordered_fields: FxHashSet::default(),
536 vertex_kinds: Vec::new(),
537 edge_kinds: Vec::new(),
538 }
539 }
540
541 #[cfg(feature = "grammars")]
543 fn get_grammar(name: &str) -> panproto_grammars::Grammar {
544 panproto_grammars::grammars()
545 .into_iter()
546 .find(|g| g.name == name)
547 .unwrap_or_else(|| panic!("grammar '{name}' not enabled in features"))
548 }
549
550 #[test]
551 #[cfg(feature = "grammars")]
552 fn walk_simple_typescript() {
553 let source = b"function greet(name: string): string { return name; }";
554 let grammar = get_grammar("typescript");
555
556 let mut parser = tree_sitter::Parser::new();
557 parser.set_language(&grammar.language).unwrap();
558 let tree = parser.parse(source, None).unwrap();
559
560 let protocol = make_test_protocol();
561 let meta = make_test_meta();
562 let mut detector =
563 crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
564 .unwrap();
565 let walker = AstWalker::new(
566 source,
567 &meta,
568 &protocol,
569 WalkerConfig::standard(),
570 Some(&mut detector),
571 );
572
573 let schema = walker.walk(&tree, "test.ts").unwrap();
574
575 assert!(
577 schema.vertices.len() > 1,
578 "expected multiple vertices, got {}",
579 schema.vertices.len()
580 );
581
582 let root_name: panproto_gat::Name = "test.ts".into();
584 assert!(
585 schema.vertices.contains_key(&root_name),
586 "missing root vertex"
587 );
588
589 if detector.has_query() {
591 let has_greet = schema
592 .vertices
593 .keys()
594 .any(|n| n.to_string().ends_with("::greet"));
595 assert!(
596 has_greet,
597 "expected a vertex ID ending in ::greet, got: {:?}",
598 schema
599 .vertices
600 .keys()
601 .map(ToString::to_string)
602 .collect::<Vec<_>>()
603 );
604 }
605 }
606
607 #[test]
608 #[cfg(feature = "grammars")]
609 fn walk_simple_python() {
610 let source = b"def add(a, b):\n return a + b\n";
611 let grammar = get_grammar("python");
612
613 let mut parser = tree_sitter::Parser::new();
614 parser.set_language(&grammar.language).unwrap();
615 let tree = parser.parse(source, None).unwrap();
616
617 let protocol = make_test_protocol();
618 let meta = make_test_meta();
619 let mut detector =
620 crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
621 .unwrap();
622 let walker = AstWalker::new(
623 source,
624 &meta,
625 &protocol,
626 WalkerConfig::standard(),
627 Some(&mut detector),
628 );
629
630 let schema = walker.walk(&tree, "test.py").unwrap();
631
632 assert!(
633 schema.vertices.len() > 1,
634 "expected multiple vertices, got {}",
635 schema.vertices.len()
636 );
637
638 if detector.has_query() {
639 let has_add = schema
640 .vertices
641 .keys()
642 .any(|n| n.to_string().ends_with("::add"));
643 assert!(has_add, "expected ::add vertex");
644 }
645 }
646
647 #[test]
648 #[cfg(feature = "grammars")]
649 fn walk_simple_rust() {
650 let source = b"fn verify_push() {}\nstruct Foo;\nimpl Foo { fn bar(&self) {} }\n";
651 let grammar = get_grammar("rust");
652
653 let mut parser = tree_sitter::Parser::new();
654 parser.set_language(&grammar.language).unwrap();
655 let tree = parser.parse(source, None).unwrap();
656
657 let protocol = make_test_protocol();
658 let meta = make_test_meta();
659 let mut detector =
660 crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
661 .unwrap();
662 let walker = AstWalker::new(
663 source,
664 &meta,
665 &protocol,
666 WalkerConfig::standard(),
667 Some(&mut detector),
668 );
669
670 let schema = walker.walk(&tree, "test.rs").unwrap();
671
672 assert!(
673 schema.vertices.len() > 1,
674 "expected multiple vertices, got {}",
675 schema.vertices.len()
676 );
677
678 if detector.has_query() {
679 let vertex_ids: Vec<String> = schema.vertices.keys().map(ToString::to_string).collect();
680
681 assert!(
684 vertex_ids.iter().any(|id| id.ends_with("::verify_push")),
685 "expected ::verify_push named scope, got: {vertex_ids:?}"
686 );
687 assert!(
688 vertex_ids.iter().any(|id| id.ends_with("::Foo")),
689 "expected ::Foo named scope, got: {vertex_ids:?}"
690 );
691 }
692 }
693
694 #[cfg(feature = "group-data")]
696 fn assert_roundtrip(grammar_name: &str, source: &[u8], file_path: &str) {
697 use crate::registry::AstParser;
698 let grammar = panproto_grammars::grammars()
699 .into_iter()
700 .find(|g| g.name == grammar_name)
701 .unwrap_or_else(|| panic!("grammar '{grammar_name}' not enabled"));
702
703 let config = crate::languages::walker_configs::walker_config_for(grammar_name);
704 let lang_parser = crate::languages::common::LanguageParser::from_language(
705 grammar_name,
706 grammar.extensions.to_vec(),
707 grammar.language,
708 grammar.node_types,
709 grammar.tags_query,
710 config,
711 )
712 .unwrap();
713
714 let schema = lang_parser.parse(source, file_path).unwrap();
715 let emitted = lang_parser.emit(&schema).unwrap();
716
717 assert_eq!(
718 std::str::from_utf8(source).unwrap(),
719 std::str::from_utf8(&emitted).unwrap(),
720 "round-trip failed for {grammar_name}: emitted bytes differ from source"
721 );
722 }
723
724 #[test]
725 #[cfg(feature = "group-data")]
726 fn roundtrip_json_simple() {
727 assert_roundtrip("json", br#"{"name": "test", "value": 42}"#, "test.json");
728 }
729
730 #[test]
731 #[cfg(feature = "group-data")]
732 fn roundtrip_json_formatted() {
733 let source =
734 b"{\n \"name\": \"test\",\n \"value\": 42,\n \"nested\": {\n \"a\": true\n }\n}";
735 assert_roundtrip("json", source, "test.json");
736 }
737
738 #[test]
739 #[cfg(feature = "group-data")]
740 fn roundtrip_json_array() {
741 let source = b"[\n 1,\n 2,\n 3\n]";
742 assert_roundtrip("json", source, "test.json");
743 }
744
745 #[test]
746 #[cfg(feature = "group-data")]
747 fn roundtrip_xml_simple() {
748 let source = b"<root>\n <child attr=\"val\">text</child>\n</root>";
749 assert_roundtrip("xml", source, "test.xml");
750 }
751
752 #[test]
753 #[cfg(feature = "group-data")]
754 fn roundtrip_yaml_simple() {
755 let source = b"name: test\nvalue: 42\nnested:\n a: true\n";
756 assert_roundtrip("yaml", source, "test.yaml");
757 }
758
759 #[test]
760 #[cfg(feature = "group-data")]
761 fn roundtrip_toml_simple() {
762 let source = b"[package]\nname = \"test\"\nversion = \"0.1.0\"\n";
763 assert_roundtrip("toml", source, "test.toml");
764 }
765
766 #[cfg(feature = "group-data")]
767 fn parse_with(grammar_name: &str, source: &[u8], file_path: &str) -> panproto_schema::Schema {
768 use crate::registry::AstParser;
769 let grammar = panproto_grammars::grammars()
770 .into_iter()
771 .find(|g| g.name == grammar_name)
772 .unwrap_or_else(|| panic!("grammar '{grammar_name}' not enabled"));
773 let config = crate::languages::walker_configs::walker_config_for(grammar_name);
774 let lang_parser = crate::languages::common::LanguageParser::from_language(
775 grammar_name,
776 grammar.extensions.to_vec(),
777 grammar.language,
778 grammar.node_types,
779 grammar.tags_query,
780 config,
781 )
782 .unwrap();
783 lang_parser.parse(source, file_path).unwrap()
784 }
785
786 #[test]
787 #[cfg(feature = "group-data")]
788 fn fingerprint_and_child_kinds_emitted_separately() {
789 let schema = parse_with("json", br#"{"a": 1}"#, "test.json");
796
797 let saw_fingerprint = schema.constraints.values().any(|cs| {
798 cs.iter()
799 .any(|c| c.sort.as_ref() == "chose-alt-fingerprint")
800 });
801 let saw_child_kinds = schema.constraints.values().any(|cs| {
802 cs.iter()
803 .any(|c| c.sort.as_ref() == "chose-alt-child-kinds")
804 });
805
806 assert!(
807 saw_fingerprint,
808 "walker must emit chose-alt-fingerprint (literal-token witness)"
809 );
810 assert!(
811 saw_child_kinds,
812 "walker must emit chose-alt-child-kinds (named-kind witness)"
813 );
814 }
815
816 #[test]
817 #[cfg(feature = "group-data")]
818 fn child_kinds_excludes_hidden_rules() {
819 let schema = parse_with("json", br#"{"k": "v"}"#, "test.json");
822
823 for cs in schema.constraints.values() {
824 for c in cs {
825 if c.sort.as_ref() == "chose-alt-child-kinds" {
826 for kind in c.value.split_whitespace() {
827 assert!(
828 !kind.starts_with('_'),
829 "hidden-rule kind '{kind}' must not appear in chose-alt-child-kinds"
830 );
831 }
832 }
833 }
834 }
835 }
836}