1use std::collections::BTreeMap;
17
18use panproto_schema::{Protocol, Schema, SchemaBuilder};
19use rustc_hash::FxHashSet;
20
21use crate::error::ParseError;
22use crate::id_scheme::IdGenerator;
23use crate::scope_detector::{NamedScope, ScopeDetector};
24use crate::theory_extract::ExtractedTheoryMeta;
25
26const BLOCK_KINDS: &[&str] = &[
33 "block",
34 "statement_block",
35 "compound_statement",
36 "declaration_list",
37 "field_declaration_list",
38 "enum_body",
39 "class_body",
40 "interface_body",
41 "module_body",
42];
43
44#[derive(Debug, Clone, Default)]
46pub struct WalkerConfig {
47 pub extra_block_kinds: Vec<String>,
55 pub capture_comments: bool,
57 pub capture_formatting: bool,
59}
60
61impl WalkerConfig {
62 #[must_use]
65 pub const fn standard() -> Self {
66 Self {
67 extra_block_kinds: Vec::new(),
68 capture_comments: true,
69 capture_formatting: true,
70 }
71 }
72}
73
74pub struct AstWalker<'a> {
83 source: &'a [u8],
85 theory_meta: &'a ExtractedTheoryMeta,
89 protocol: &'a Protocol,
91 config: WalkerConfig,
93 block_kinds: FxHashSet<String>,
95 scope_map: BTreeMap<(usize, usize), NamedScope>,
99}
100
101impl<'a> AstWalker<'a> {
102 #[must_use]
113 pub fn new(
114 source: &'a [u8],
115 theory_meta: &'a ExtractedTheoryMeta,
116 protocol: &'a Protocol,
117 config: WalkerConfig,
118 scope_detector: Option<&mut ScopeDetector>,
119 ) -> Self {
120 let mut block_kinds: FxHashSet<String> =
121 BLOCK_KINDS.iter().map(|s| (*s).to_owned()).collect();
122 for kind in &config.extra_block_kinds {
123 block_kinds.insert(kind.clone());
124 }
125
126 let mut scope_map: BTreeMap<(usize, usize), NamedScope> = BTreeMap::new();
127 if let Some(det) = scope_detector {
128 for scope in det.scopes(source) {
129 scope_map.insert((scope.node_range.start, scope.node_range.end), scope);
130 }
131 }
132
133 Self {
134 source,
135 theory_meta,
136 protocol,
137 config,
138 block_kinds,
139 scope_map,
140 }
141 }
142
143 pub fn walk(&self, tree: &tree_sitter::Tree, file_path: &str) -> Result<Schema, ParseError> {
149 let mut id_gen = IdGenerator::new(file_path);
150 let builder = SchemaBuilder::new(self.protocol);
151 let root = tree.root_node();
152
153 let builder = self.walk_node(root, builder, &mut id_gen, None)?;
154
155 builder.build().map_err(|e| ParseError::SchemaConstruction {
156 reason: e.to_string(),
157 })
158 }
159
160 fn scope_for(&self, node: tree_sitter::Node<'_>) -> Option<&NamedScope> {
162 self.scope_map.get(&(node.start_byte(), node.end_byte()))
163 }
164
165 fn walk_node(
167 &self,
168 node: tree_sitter::Node<'_>,
169 mut builder: SchemaBuilder,
170 id_gen: &mut IdGenerator,
171 parent_vertex_id: Option<&str>,
172 ) -> Result<SchemaBuilder, ParseError> {
173 if !node.is_named() {
175 return Ok(builder);
176 }
177
178 let kind = node.kind();
179
180 let is_root_wrapper = parent_vertex_id.is_none()
183 && (kind == "program"
184 || kind == "source_file"
185 || kind == "module"
186 || kind == "translation_unit");
187
188 let named_scope = if is_root_wrapper {
189 None
190 } else {
191 self.scope_for(node)
192 };
193
194 let vertex_id = if is_root_wrapper {
196 id_gen.current_prefix()
198 } else if let Some(scope) = named_scope {
199 id_gen.named_id(&scope.name)
200 } else {
201 id_gen.anonymous_id()
203 };
204
205 let effective_kind = if self.protocol.obj_kinds.is_empty() {
209 kind
211 } else if self.protocol.obj_kinds.iter().any(|k| k == kind) {
212 kind
213 } else if !self.theory_meta.vertex_kinds.is_empty()
214 && self.theory_meta.vertex_kinds.iter().any(|k| k == kind)
215 {
216 kind
218 } else {
219 "node"
220 };
221
222 builder = builder
223 .vertex(&vertex_id, effective_kind, None)
224 .map_err(|e| ParseError::SchemaConstruction {
225 reason: format!("vertex '{vertex_id}' ({kind}): {e}"),
226 })?;
227
228 if let Some(parent_id) = parent_vertex_id {
230 let edge_kind = node
233 .parent()
234 .and_then(|p| {
235 for i in 0..p.child_count() {
237 if let Some(child) = p.child(i) {
238 if child.id() == node.id() {
239 return u32::try_from(i)
240 .ok()
241 .and_then(|idx| p.field_name_for_child(idx));
242 }
243 }
244 }
245 None
246 })
247 .unwrap_or("child_of");
248
249 builder = builder
250 .edge(parent_id, &vertex_id, edge_kind, None)
251 .map_err(|e| ParseError::SchemaConstruction {
252 reason: format!("edge {parent_id} -> {vertex_id} ({edge_kind}): {e}"),
253 })?;
254 }
255
256 builder = builder.constraint(&vertex_id, "start-byte", &node.start_byte().to_string());
258 builder = builder.constraint(&vertex_id, "end-byte", &node.end_byte().to_string());
259
260 if node.named_child_count() == 0 {
262 if let Ok(text) = node.utf8_text(self.source) {
263 builder = builder.constraint(&vertex_id, "literal-value", text);
264 }
265 }
266
267 if self.config.capture_formatting {
269 builder = self.emit_formatting_constraints(node, &vertex_id, builder);
270 }
271
272 let entered_scope = if let Some(scope) = named_scope {
274 id_gen.push_named_scope(&scope.name);
275 true
276 } else if !is_root_wrapper && self.block_kinds.contains(kind) {
277 id_gen.push_anonymous_scope();
278 true
279 } else {
280 false
281 };
282
283 builder = self.walk_children_with_interstitials(node, builder, id_gen, &vertex_id)?;
284
285 if entered_scope {
286 id_gen.pop_scope();
287 }
288
289 Ok(builder)
290 }
291
292 fn walk_children_with_interstitials(
304 &self,
305 node: tree_sitter::Node<'_>,
306 mut builder: SchemaBuilder,
307 id_gen: &mut IdGenerator,
308 vertex_id: &str,
309 ) -> Result<SchemaBuilder, ParseError> {
310 let cursor = &mut node.walk();
311 let children: Vec<_> = node.named_children(cursor).collect();
312 let mut interstitial_idx = 0;
313 let mut prev_end = node.start_byte();
314 let mut fingerprint_parts: Vec<String> = Vec::new();
315 let mut child_kinds: Vec<String> = Vec::new();
316
317 for child in &children {
318 let gap_start = prev_end;
319 let gap_end = child.start_byte();
320 builder = self.capture_interstitial(
321 builder,
322 vertex_id,
323 gap_start,
324 gap_end,
325 &mut interstitial_idx,
326 &mut fingerprint_parts,
327 );
328 let child_kind = child.kind();
338 if !child_kind.starts_with('_') {
339 child_kinds.push(child_kind.to_owned());
340 }
341 builder = self.walk_node(*child, builder, id_gen, Some(vertex_id))?;
342 prev_end = child.end_byte();
343 }
344
345 builder = self.capture_interstitial(
347 builder,
348 vertex_id,
349 prev_end,
350 node.end_byte(),
351 &mut interstitial_idx,
352 &mut fingerprint_parts,
353 );
354
355 if !fingerprint_parts.is_empty() {
356 builder = builder.constraint(
357 vertex_id,
358 "chose-alt-fingerprint",
359 &fingerprint_parts.join(" "),
360 );
361 }
362 if !child_kinds.is_empty() {
363 builder =
364 builder.constraint(vertex_id, "chose-alt-child-kinds", &child_kinds.join(" "));
365 }
366
367 Ok(builder)
368 }
369
370 fn capture_interstitial(
372 &self,
373 mut builder: SchemaBuilder,
374 vertex_id: &str,
375 gap_start: usize,
376 gap_end: usize,
377 idx: &mut usize,
378 fingerprint: &mut Vec<String>,
379 ) -> SchemaBuilder {
380 if gap_end > gap_start && gap_end <= self.source.len() {
381 if let Ok(gap_text) = std::str::from_utf8(&self.source[gap_start..gap_end]) {
382 if !gap_text.is_empty() {
383 let sort = format!("interstitial-{}", *idx);
384 builder = builder.constraint(vertex_id, &sort, gap_text);
385 builder = builder.constraint(
386 vertex_id,
387 &format!("{sort}-start-byte"),
388 &gap_start.to_string(),
389 );
390 *idx += 1;
391 let trimmed = gap_text.trim();
392 if !trimmed.is_empty() {
393 fingerprint.push(trimmed.to_owned());
394 }
395 }
396 }
397 }
398 builder
399 }
400
401 fn emit_formatting_constraints(
403 &self,
404 node: tree_sitter::Node<'_>,
405 vertex_id: &str,
406 mut builder: SchemaBuilder,
407 ) -> SchemaBuilder {
408 let start = node.start_position();
409
410 if start.column > 0 {
412 let line_start = node.start_byte().saturating_sub(start.column);
414 if line_start < self.source.len() {
415 let indent_end = line_start + start.column.min(self.source.len() - line_start);
416 if let Ok(indent) = std::str::from_utf8(&self.source[line_start..indent_end]) {
417 if !indent.is_empty() && indent.trim().is_empty() {
419 builder = builder.constraint(vertex_id, "indent", indent);
420 }
421 }
422 }
423 }
424
425 if let Some(prev) = node.prev_named_sibling() {
428 let gap_start = prev.end_byte();
429 let gap_end = node.start_byte();
430 if gap_start < gap_end && gap_end <= self.source.len() {
431 let gap = &self.source[gap_start..gap_end];
432 let blank_lines = memchr::memchr_iter(b'\n', gap).count().saturating_sub(1);
433 if blank_lines > 0 {
434 builder = builder.constraint(
435 vertex_id,
436 "blank-lines-before",
437 &blank_lines.to_string(),
438 );
439 }
440 }
441 }
442
443 builder
444 }
445}
446
447#[cfg(test)]
448#[allow(clippy::unwrap_used)]
449mod tests {
450 use super::*;
451
452 fn make_test_protocol() -> Protocol {
453 Protocol {
454 name: "test".into(),
455 schema_theory: "ThTest".into(),
456 instance_theory: "ThTestInst".into(),
457 schema_composition: None,
458 instance_composition: None,
459 obj_kinds: vec![], edge_rules: vec![],
461 constraint_sorts: vec![],
462 has_order: true,
463 has_coproducts: false,
464 has_recursion: false,
465 has_causal: false,
466 nominal_identity: false,
467 has_defaults: false,
468 has_coercions: false,
469 has_mergers: false,
470 has_policies: false,
471 }
472 }
473
474 fn make_test_meta() -> ExtractedTheoryMeta {
475 use panproto_gat::{Sort, Theory};
476 ExtractedTheoryMeta {
477 theory: Theory::new("ThTest", vec![Sort::simple("Vertex")], vec![], vec![]),
478 supertypes: FxHashSet::default(),
479 subtype_map: Vec::new(),
480 optional_fields: FxHashSet::default(),
481 ordered_fields: FxHashSet::default(),
482 vertex_kinds: Vec::new(),
483 edge_kinds: Vec::new(),
484 }
485 }
486
487 #[cfg(feature = "grammars")]
489 fn get_grammar(name: &str) -> panproto_grammars::Grammar {
490 panproto_grammars::grammars()
491 .into_iter()
492 .find(|g| g.name == name)
493 .unwrap_or_else(|| panic!("grammar '{name}' not enabled in features"))
494 }
495
496 #[test]
497 #[cfg(feature = "grammars")]
498 fn walk_simple_typescript() {
499 let source = b"function greet(name: string): string { return name; }";
500 let grammar = get_grammar("typescript");
501
502 let mut parser = tree_sitter::Parser::new();
503 parser.set_language(&grammar.language).unwrap();
504 let tree = parser.parse(source, None).unwrap();
505
506 let protocol = make_test_protocol();
507 let meta = make_test_meta();
508 let mut detector =
509 crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
510 .unwrap();
511 let walker = AstWalker::new(
512 source,
513 &meta,
514 &protocol,
515 WalkerConfig::standard(),
516 Some(&mut detector),
517 );
518
519 let schema = walker.walk(&tree, "test.ts").unwrap();
520
521 assert!(
523 schema.vertices.len() > 1,
524 "expected multiple vertices, got {}",
525 schema.vertices.len()
526 );
527
528 let root_name: panproto_gat::Name = "test.ts".into();
530 assert!(
531 schema.vertices.contains_key(&root_name),
532 "missing root vertex"
533 );
534
535 if detector.has_query() {
537 let has_greet = schema
538 .vertices
539 .keys()
540 .any(|n| n.to_string().ends_with("::greet"));
541 assert!(
542 has_greet,
543 "expected a vertex ID ending in ::greet, got: {:?}",
544 schema
545 .vertices
546 .keys()
547 .map(ToString::to_string)
548 .collect::<Vec<_>>()
549 );
550 }
551 }
552
553 #[test]
554 #[cfg(feature = "grammars")]
555 fn walk_simple_python() {
556 let source = b"def add(a, b):\n return a + b\n";
557 let grammar = get_grammar("python");
558
559 let mut parser = tree_sitter::Parser::new();
560 parser.set_language(&grammar.language).unwrap();
561 let tree = parser.parse(source, None).unwrap();
562
563 let protocol = make_test_protocol();
564 let meta = make_test_meta();
565 let mut detector =
566 crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
567 .unwrap();
568 let walker = AstWalker::new(
569 source,
570 &meta,
571 &protocol,
572 WalkerConfig::standard(),
573 Some(&mut detector),
574 );
575
576 let schema = walker.walk(&tree, "test.py").unwrap();
577
578 assert!(
579 schema.vertices.len() > 1,
580 "expected multiple vertices, got {}",
581 schema.vertices.len()
582 );
583
584 if detector.has_query() {
585 let has_add = schema
586 .vertices
587 .keys()
588 .any(|n| n.to_string().ends_with("::add"));
589 assert!(has_add, "expected ::add vertex");
590 }
591 }
592
593 #[test]
594 #[cfg(feature = "grammars")]
595 fn walk_simple_rust() {
596 let source = b"fn verify_push() {}\nstruct Foo;\nimpl Foo { fn bar(&self) {} }\n";
597 let grammar = get_grammar("rust");
598
599 let mut parser = tree_sitter::Parser::new();
600 parser.set_language(&grammar.language).unwrap();
601 let tree = parser.parse(source, None).unwrap();
602
603 let protocol = make_test_protocol();
604 let meta = make_test_meta();
605 let mut detector =
606 crate::scope_detector::ScopeDetector::new(&grammar.language, grammar.tags_query, None)
607 .unwrap();
608 let walker = AstWalker::new(
609 source,
610 &meta,
611 &protocol,
612 WalkerConfig::standard(),
613 Some(&mut detector),
614 );
615
616 let schema = walker.walk(&tree, "test.rs").unwrap();
617
618 assert!(
619 schema.vertices.len() > 1,
620 "expected multiple vertices, got {}",
621 schema.vertices.len()
622 );
623
624 if detector.has_query() {
625 let vertex_ids: Vec<String> = schema.vertices.keys().map(ToString::to_string).collect();
626
627 assert!(
630 vertex_ids.iter().any(|id| id.ends_with("::verify_push")),
631 "expected ::verify_push named scope, got: {vertex_ids:?}"
632 );
633 assert!(
634 vertex_ids.iter().any(|id| id.ends_with("::Foo")),
635 "expected ::Foo named scope, got: {vertex_ids:?}"
636 );
637 }
638 }
639
640 #[cfg(feature = "group-data")]
642 fn assert_roundtrip(grammar_name: &str, source: &[u8], file_path: &str) {
643 use crate::registry::AstParser;
644 let grammar = panproto_grammars::grammars()
645 .into_iter()
646 .find(|g| g.name == grammar_name)
647 .unwrap_or_else(|| panic!("grammar '{grammar_name}' not enabled"));
648
649 let config = crate::languages::walker_configs::walker_config_for(grammar_name);
650 let lang_parser = crate::languages::common::LanguageParser::from_language(
651 grammar_name,
652 grammar.extensions.to_vec(),
653 grammar.language,
654 grammar.node_types,
655 grammar.tags_query,
656 config,
657 )
658 .unwrap();
659
660 let schema = lang_parser.parse(source, file_path).unwrap();
661 let emitted = lang_parser.emit(&schema).unwrap();
662
663 assert_eq!(
664 std::str::from_utf8(source).unwrap(),
665 std::str::from_utf8(&emitted).unwrap(),
666 "round-trip failed for {grammar_name}: emitted bytes differ from source"
667 );
668 }
669
670 #[test]
671 #[cfg(feature = "group-data")]
672 fn roundtrip_json_simple() {
673 assert_roundtrip("json", br#"{"name": "test", "value": 42}"#, "test.json");
674 }
675
676 #[test]
677 #[cfg(feature = "group-data")]
678 fn roundtrip_json_formatted() {
679 let source =
680 b"{\n \"name\": \"test\",\n \"value\": 42,\n \"nested\": {\n \"a\": true\n }\n}";
681 assert_roundtrip("json", source, "test.json");
682 }
683
684 #[test]
685 #[cfg(feature = "group-data")]
686 fn roundtrip_json_array() {
687 let source = b"[\n 1,\n 2,\n 3\n]";
688 assert_roundtrip("json", source, "test.json");
689 }
690
691 #[test]
692 #[cfg(feature = "group-data")]
693 fn roundtrip_xml_simple() {
694 let source = b"<root>\n <child attr=\"val\">text</child>\n</root>";
695 assert_roundtrip("xml", source, "test.xml");
696 }
697
698 #[test]
699 #[cfg(feature = "group-data")]
700 fn roundtrip_yaml_simple() {
701 let source = b"name: test\nvalue: 42\nnested:\n a: true\n";
702 assert_roundtrip("yaml", source, "test.yaml");
703 }
704
705 #[test]
706 #[cfg(feature = "group-data")]
707 fn roundtrip_toml_simple() {
708 let source = b"[package]\nname = \"test\"\nversion = \"0.1.0\"\n";
709 assert_roundtrip("toml", source, "test.toml");
710 }
711
712 #[cfg(feature = "group-data")]
713 fn parse_with(grammar_name: &str, source: &[u8], file_path: &str) -> panproto_schema::Schema {
714 use crate::registry::AstParser;
715 let grammar = panproto_grammars::grammars()
716 .into_iter()
717 .find(|g| g.name == grammar_name)
718 .unwrap_or_else(|| panic!("grammar '{grammar_name}' not enabled"));
719 let config = crate::languages::walker_configs::walker_config_for(grammar_name);
720 let lang_parser = crate::languages::common::LanguageParser::from_language(
721 grammar_name,
722 grammar.extensions.to_vec(),
723 grammar.language,
724 grammar.node_types,
725 grammar.tags_query,
726 config,
727 )
728 .unwrap();
729 lang_parser.parse(source, file_path).unwrap()
730 }
731
732 #[test]
733 #[cfg(feature = "group-data")]
734 fn fingerprint_and_child_kinds_emitted_separately() {
735 let schema = parse_with("json", br#"{"a": 1}"#, "test.json");
742
743 let saw_fingerprint = schema.constraints.values().any(|cs| {
744 cs.iter()
745 .any(|c| c.sort.as_ref() == "chose-alt-fingerprint")
746 });
747 let saw_child_kinds = schema.constraints.values().any(|cs| {
748 cs.iter()
749 .any(|c| c.sort.as_ref() == "chose-alt-child-kinds")
750 });
751
752 assert!(
753 saw_fingerprint,
754 "walker must emit chose-alt-fingerprint (literal-token witness)"
755 );
756 assert!(
757 saw_child_kinds,
758 "walker must emit chose-alt-child-kinds (named-kind witness)"
759 );
760 }
761
762 #[test]
763 #[cfg(feature = "group-data")]
764 fn child_kinds_excludes_hidden_rules() {
765 let schema = parse_with("json", br#"{"k": "v"}"#, "test.json");
768
769 for cs in schema.constraints.values() {
770 for c in cs {
771 if c.sort.as_ref() == "chose-alt-child-kinds" {
772 for kind in c.value.split_whitespace() {
773 assert!(
774 !kind.starts_with('_'),
775 "hidden-rule kind '{kind}' must not appear in chose-alt-child-kinds"
776 );
777 }
778 }
779 }
780 }
781 }
782}