1use panproto_schema::{Protocol, Schema, SchemaBuilder};
9use rustc_hash::FxHashSet;
10
11use crate::error::ParseError;
12use crate::id_scheme::IdGenerator;
13use crate::theory_extract::ExtractedTheoryMeta;
14
/// Built-in tree-sitter node kinds that introduce a scope.
///
/// `AstWalker::new` copies these into its scope-kind set (together with any
/// `WalkerConfig::extra_scope_kinds`). Nodes of these kinds receive a named
/// vertex ID when a name can be extracted, and push a scope on the ID
/// generator while their children are walked.
const SCOPE_INTRODUCING_KINDS: &[&str] = &[
    // Functions and methods.
    "function_declaration",
    "function_definition",
    "method_declaration",
    "method_definition",
    // Classes and interfaces.
    "class_declaration",
    "class_definition",
    "interface_declaration",
    // Structs, enums, impls and traits.
    "struct_item",
    "enum_item",
    "enum_declaration",
    "impl_item",
    "trait_item",
    // Modules, namespaces and packages.
    "module",
    "namespace_definition",
    "package_declaration",
];
36
/// Built-in tree-sitter node kinds treated as anonymous blocks.
///
/// `AstWalker::new` copies these into its block-kind set (together with any
/// `WalkerConfig::extra_block_kinds`). A node of one of these kinds pushes an
/// anonymous scope on the ID generator while its children are walked.
const BLOCK_KINDS: &[&str] = &[
    // Statement blocks.
    "block",
    "statement_block",
    "compound_statement",
    // Declaration and member lists.
    "declaration_list",
    "field_declaration_list",
    // Bodies of type-like and module-like items.
    "enum_body",
    "class_body",
    "interface_body",
    "module_body",
];
49
/// Options controlling how an `AstWalker` traverses a syntax tree.
#[derive(Debug, Clone)]
pub struct WalkerConfig {
    /// Node kinds treated as scope-introducing in addition to the built-in
    /// list.
    pub extra_scope_kinds: Vec<String>,
    /// Node kinds treated as anonymous blocks in addition to the built-in
    /// list.
    pub extra_block_kinds: Vec<String>,
    /// Field names tried, in order, when looking up the name of a
    /// scope-introducing node.
    pub name_fields: Vec<String>,
    /// Whether comments should be captured.
    ///
    /// NOTE(review): nothing in the walker code visible here reads this
    /// flag; confirm where it is consumed.
    pub capture_comments: bool,
    /// Whether the walker emits the `indent` and `blank-lines-before`
    /// formatting constraints.
    pub capture_formatting: bool,
}
65
66impl Default for WalkerConfig {
67 fn default() -> Self {
68 Self {
69 extra_scope_kinds: Vec::new(),
70 extra_block_kinds: Vec::new(),
71 name_fields: vec!["name".to_owned(), "identifier".to_owned()],
72 capture_comments: true,
73 capture_formatting: true,
74 }
75 }
76}
77
78pub struct AstWalker<'a> {
83 source: &'a [u8],
85 theory_meta: &'a ExtractedTheoryMeta,
89 protocol: &'a Protocol,
91 config: WalkerConfig,
93 scope_kinds: FxHashSet<String>,
95 block_kinds: FxHashSet<String>,
97}
98
99impl<'a> AstWalker<'a> {
100 #[must_use]
102 pub fn new(
103 source: &'a [u8],
104 theory_meta: &'a ExtractedTheoryMeta,
105 protocol: &'a Protocol,
106 config: WalkerConfig,
107 ) -> Self {
108 let mut scope_kinds: FxHashSet<String> = SCOPE_INTRODUCING_KINDS
109 .iter()
110 .map(|s| (*s).to_owned())
111 .collect();
112 for kind in &config.extra_scope_kinds {
113 scope_kinds.insert(kind.clone());
114 }
115
116 let mut block_kinds: FxHashSet<String> =
117 BLOCK_KINDS.iter().map(|s| (*s).to_owned()).collect();
118 for kind in &config.extra_block_kinds {
119 block_kinds.insert(kind.clone());
120 }
121
122 Self {
123 source,
124 theory_meta,
125 protocol,
126 config,
127 scope_kinds,
128 block_kinds,
129 }
130 }
131
132 pub fn walk(&self, tree: &tree_sitter::Tree, file_path: &str) -> Result<Schema, ParseError> {
138 let mut id_gen = IdGenerator::new(file_path);
139 let builder = SchemaBuilder::new(self.protocol);
140 let root = tree.root_node();
141
142 let builder = self.walk_node(root, builder, &mut id_gen, None)?;
143
144 builder.build().map_err(|e| ParseError::SchemaConstruction {
145 reason: e.to_string(),
146 })
147 }
148
149 fn walk_node(
151 &self,
152 node: tree_sitter::Node<'_>,
153 mut builder: SchemaBuilder,
154 id_gen: &mut IdGenerator,
155 parent_vertex_id: Option<&str>,
156 ) -> Result<SchemaBuilder, ParseError> {
157 if !node.is_named() {
159 return Ok(builder);
160 }
161
162 let kind = node.kind();
163
164 let is_root_wrapper = parent_vertex_id.is_none()
167 && (kind == "program"
168 || kind == "source_file"
169 || kind == "module"
170 || kind == "translation_unit");
171
172 let vertex_id = if is_root_wrapper {
174 id_gen.current_prefix()
176 } else if self.scope_kinds.contains(kind) {
177 let name = self.extract_scope_name(&node);
179 match name {
180 Some(n) => id_gen.named_id(&n),
181 None => id_gen.anonymous_id(),
182 }
183 } else {
184 id_gen.anonymous_id()
186 };
187
188 let effective_kind = if self.protocol.obj_kinds.is_empty() {
192 kind
194 } else if self.protocol.obj_kinds.iter().any(|k| k == kind) {
195 kind
196 } else if !self.theory_meta.vertex_kinds.is_empty()
197 && self.theory_meta.vertex_kinds.iter().any(|k| k == kind)
198 {
199 kind
201 } else {
202 "node"
203 };
204
205 builder = builder
206 .vertex(&vertex_id, effective_kind, None)
207 .map_err(|e| ParseError::SchemaConstruction {
208 reason: format!("vertex '{vertex_id}' ({kind}): {e}"),
209 })?;
210
211 if let Some(parent_id) = parent_vertex_id {
213 let edge_kind = node
216 .parent()
217 .and_then(|p| {
218 for i in 0..p.child_count() {
220 if let Some(child) = p.child(i) {
221 if child.id() == node.id() {
222 return u32::try_from(i)
223 .ok()
224 .and_then(|idx| p.field_name_for_child(idx));
225 }
226 }
227 }
228 None
229 })
230 .unwrap_or("child_of");
231
232 builder = builder
233 .edge(parent_id, &vertex_id, edge_kind, None)
234 .map_err(|e| ParseError::SchemaConstruction {
235 reason: format!("edge {parent_id} -> {vertex_id} ({edge_kind}): {e}"),
236 })?;
237 }
238
239 builder = builder.constraint(&vertex_id, "start-byte", &node.start_byte().to_string());
241 builder = builder.constraint(&vertex_id, "end-byte", &node.end_byte().to_string());
242
243 if node.named_child_count() == 0 {
245 if let Ok(text) = node.utf8_text(self.source) {
246 builder = builder.constraint(&vertex_id, "literal-value", text);
247 }
248 }
249
250 if self.config.capture_formatting {
252 builder = self.emit_formatting_constraints(node, &vertex_id, builder);
253 }
254
255 let entered_scope = if self.scope_kinds.contains(kind) && !is_root_wrapper {
257 match self.extract_scope_name(&node) {
258 Some(n) => id_gen.push_named_scope(&n),
259 None => {
260 id_gen.push_anonymous_scope();
261 }
262 }
263 true
264 } else if self.block_kinds.contains(kind) {
265 id_gen.push_anonymous_scope();
266 true
267 } else {
268 false
269 };
270
271 builder = self.walk_children_with_interstitials(node, builder, id_gen, &vertex_id)?;
272
273 if entered_scope {
274 id_gen.pop_scope();
275 }
276
277 Ok(builder)
278 }
279
280 fn walk_children_with_interstitials(
282 &self,
283 node: tree_sitter::Node<'_>,
284 mut builder: SchemaBuilder,
285 id_gen: &mut IdGenerator,
286 vertex_id: &str,
287 ) -> Result<SchemaBuilder, ParseError> {
288 let cursor = &mut node.walk();
289 let children: Vec<_> = node.named_children(cursor).collect();
290 let mut interstitial_idx = 0;
291 let mut prev_end = node.start_byte();
292
293 for child in &children {
294 let gap_start = prev_end;
295 let gap_end = child.start_byte();
296 builder = self.capture_interstitial(
297 builder,
298 vertex_id,
299 gap_start,
300 gap_end,
301 &mut interstitial_idx,
302 );
303 builder = self.walk_node(*child, builder, id_gen, Some(vertex_id))?;
304 prev_end = child.end_byte();
305 }
306
307 builder = self.capture_interstitial(
309 builder,
310 vertex_id,
311 prev_end,
312 node.end_byte(),
313 &mut interstitial_idx,
314 );
315
316 Ok(builder)
317 }
318
319 fn capture_interstitial(
321 &self,
322 mut builder: SchemaBuilder,
323 vertex_id: &str,
324 gap_start: usize,
325 gap_end: usize,
326 idx: &mut usize,
327 ) -> SchemaBuilder {
328 if gap_end > gap_start && gap_end <= self.source.len() {
329 if let Ok(gap_text) = std::str::from_utf8(&self.source[gap_start..gap_end]) {
330 if !gap_text.is_empty() {
331 let sort = format!("interstitial-{}", *idx);
332 builder = builder.constraint(vertex_id, &sort, gap_text);
333 builder = builder.constraint(
334 vertex_id,
335 &format!("{sort}-start-byte"),
336 &gap_start.to_string(),
337 );
338 *idx += 1;
339 }
340 }
341 }
342 builder
343 }
344
345 fn extract_scope_name(&self, node: &tree_sitter::Node<'_>) -> Option<String> {
347 for field_name in &self.config.name_fields {
348 if let Some(name_node) = node.child_by_field_name(field_name.as_bytes()) {
349 if let Ok(text) = name_node.utf8_text(self.source) {
350 return Some(text.to_owned());
351 }
352 }
353 }
354 None
355 }
356
357 fn emit_formatting_constraints(
359 &self,
360 node: tree_sitter::Node<'_>,
361 vertex_id: &str,
362 mut builder: SchemaBuilder,
363 ) -> SchemaBuilder {
364 let start = node.start_position();
365
366 if start.column > 0 {
368 let line_start = node.start_byte().saturating_sub(start.column);
370 if line_start < self.source.len() {
371 let indent_end = line_start + start.column.min(self.source.len() - line_start);
372 if let Ok(indent) = std::str::from_utf8(&self.source[line_start..indent_end]) {
373 if !indent.is_empty() && indent.trim().is_empty() {
375 builder = builder.constraint(vertex_id, "indent", indent);
376 }
377 }
378 }
379 }
380
381 if let Some(prev) = node.prev_named_sibling() {
384 let gap_start = prev.end_byte();
385 let gap_end = node.start_byte();
386 if gap_start < gap_end && gap_end <= self.source.len() {
387 let gap = &self.source[gap_start..gap_end];
388 let blank_lines = memchr::memchr_iter(b'\n', gap).count().saturating_sub(1);
389 if blank_lines > 0 {
390 builder = builder.constraint(
391 vertex_id,
392 "blank-lines-before",
393 &blank_lines.to_string(),
394 );
395 }
396 }
397 }
398
399 builder
400 }
401}
402
403#[cfg(test)]
404#[allow(clippy::unwrap_used)]
405mod tests {
406 use super::*;
407
408 fn make_test_protocol() -> Protocol {
409 Protocol {
410 name: "test".into(),
411 schema_theory: "ThTest".into(),
412 instance_theory: "ThTestInst".into(),
413 schema_composition: None,
414 instance_composition: None,
415 obj_kinds: vec![], edge_rules: vec![],
417 constraint_sorts: vec![],
418 has_order: true,
419 has_coproducts: false,
420 has_recursion: false,
421 has_causal: false,
422 nominal_identity: false,
423 has_defaults: false,
424 has_coercions: false,
425 has_mergers: false,
426 has_policies: false,
427 }
428 }
429
430 fn make_test_meta() -> ExtractedTheoryMeta {
431 use panproto_gat::{Sort, Theory};
432 ExtractedTheoryMeta {
433 theory: Theory::new("ThTest", vec![Sort::simple("Vertex")], vec![], vec![]),
434 supertypes: FxHashSet::default(),
435 subtype_map: Vec::new(),
436 optional_fields: FxHashSet::default(),
437 ordered_fields: FxHashSet::default(),
438 vertex_kinds: Vec::new(),
439 edge_kinds: Vec::new(),
440 }
441 }
442
/// Looks up the tree-sitter language registered under `name`.
///
/// # Panics
///
/// Panics when no grammar with that name is returned by
/// `panproto_grammars::grammars()`.
#[cfg(feature = "grammars")]
fn get_language(name: &str) -> tree_sitter::Language {
    let grammar = panproto_grammars::grammars()
        .into_iter()
        .find(|g| g.name == name);

    match grammar {
        Some(found) => found.language,
        None => panic!("grammar '{name}' not enabled in features"),
    }
}
452
/// Walking a one-line TypeScript function yields several vertices and a
/// root vertex named after the file path.
#[test]
#[cfg(feature = "grammars")]
fn walk_simple_typescript() {
    let source = b"function greet(name: string): string { return name; }";

    let language = get_language("typescript");
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&language).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (protocol, meta) = (make_test_protocol(), make_test_meta());
    let schema = AstWalker::new(source, &meta, &protocol, WalkerConfig::default())
        .walk(&tree, "test.ts")
        .unwrap();

    let vertex_count = schema.vertices.len();
    assert!(
        vertex_count > 1,
        "expected multiple vertices, got {}",
        vertex_count
    );

    // The root wrapper node takes the ID generator's prefix as its ID.
    let root_name: panproto_gat::Name = "test.ts".into();
    let has_root = schema.vertices.contains_key(&root_name);
    assert!(has_root, "missing root vertex");
}
482
/// Walking a two-line Python function yields more than one vertex.
#[test]
#[cfg(feature = "grammars")]
fn walk_simple_python() {
    let source = b"def add(a, b):\n return a + b\n";

    let language = get_language("python");
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&language).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (protocol, meta) = (make_test_protocol(), make_test_meta());
    let schema = AstWalker::new(source, &meta, &protocol, WalkerConfig::default())
        .walk(&tree, "test.py")
        .unwrap();

    let vertex_count = schema.vertices.len();
    assert!(
        vertex_count > 1,
        "expected multiple vertices, got {}",
        vertex_count
    );
}
504
/// Walking a one-line Rust `main` function yields more than one vertex.
#[test]
#[cfg(feature = "grammars")]
fn walk_simple_rust() {
    let source = b"fn main() { let x = 42; println!(\"{}\", x); }";

    let language = get_language("rust");
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&language).unwrap();
    let tree = parser.parse(source, None).unwrap();

    let (protocol, meta) = (make_test_protocol(), make_test_meta());
    let schema = AstWalker::new(source, &meta, &protocol, WalkerConfig::default())
        .walk(&tree, "test.rs")
        .unwrap();

    let vertex_count = schema.vertices.len();
    assert!(
        vertex_count > 1,
        "expected multiple vertices, got {}",
        vertex_count
    );
}
526
/// Parses `source` with the named grammar, emits it again, and asserts the
/// emitted text equals the input.
///
/// # Panics
///
/// Panics when the grammar is not available, when parser construction,
/// parsing or emission fails, when either side is not valid UTF-8, or when
/// the emitted text differs from `source`.
#[cfg(feature = "group-data")]
fn assert_roundtrip(grammar_name: &str, source: &[u8], file_path: &str) {
    use crate::registry::AstParser;

    let lookup = panproto_grammars::grammars()
        .into_iter()
        .find(|g| g.name == grammar_name);
    let grammar = match lookup {
        Some(found) => found,
        None => panic!("grammar '{grammar_name}' not enabled"),
    };

    let config = crate::languages::walker_configs::walker_config_for(grammar_name);
    let extensions = grammar.extensions.to_vec();
    let lang_parser = crate::languages::common::LanguageParser::from_language(
        grammar_name,
        extensions,
        grammar.language,
        grammar.node_types,
        config,
    )
    .unwrap();

    let schema = lang_parser.parse(source, file_path).unwrap();
    let emitted = lang_parser.emit(&schema).unwrap();

    let original_text = std::str::from_utf8(source).unwrap();
    let emitted_text = std::str::from_utf8(&emitted).unwrap();
    assert_eq!(
        original_text, emitted_text,
        "round-trip failed for {grammar_name}: emitted bytes differ from source"
    );
}
555
/// A single-line JSON object survives parse followed by emit unchanged.
#[test]
#[cfg(feature = "group-data")]
fn roundtrip_json_simple() {
    let source: &[u8] = br#"{"name": "test", "value": 42}"#;
    assert_roundtrip("json", source, "test.json");
}
561
/// A multi-line, nested JSON object survives parse followed by emit
/// unchanged.
#[test]
#[cfg(feature = "group-data")]
fn roundtrip_json_formatted() {
    let source: &[u8] =
        b"{\n \"name\": \"test\",\n \"value\": 42,\n \"nested\": {\n \"a\": true\n }\n}";
    assert_roundtrip("json", source, "test.json");
}
569
/// A multi-line JSON array survives parse followed by emit unchanged.
#[test]
#[cfg(feature = "group-data")]
fn roundtrip_json_array() {
    let source: &[u8] = b"[\n 1,\n 2,\n 3\n]";
    assert_roundtrip("json", source, "test.json");
}
576
/// A small XML document with an attribute and text content survives parse
/// followed by emit unchanged.
#[test]
#[cfg(feature = "group-data")]
fn roundtrip_xml_simple() {
    let source: &[u8] = b"<root>\n <child attr=\"val\">text</child>\n</root>";
    assert_roundtrip("xml", source, "test.xml");
}
583
/// A small YAML mapping with a nested key survives parse followed by emit
/// unchanged.
#[test]
#[cfg(feature = "group-data")]
fn roundtrip_yaml_simple() {
    let source: &[u8] = b"name: test\nvalue: 42\nnested:\n a: true\n";
    assert_roundtrip("yaml", source, "test.yaml");
}
590
/// A small TOML table survives parse followed by emit unchanged.
#[test]
#[cfg(feature = "group-data")]
fn roundtrip_toml_simple() {
    let source: &[u8] = b"[package]\nname = \"test\"\nversion = \"0.1.0\"\n";
    assert_roundtrip("toml", source, "test.toml");
}
597}