panproto_parse/id_scheme.rs
1//! Scope-aware vertex ID generation for full-AST schemas.
2//!
3//! Generates stable, human-readable vertex IDs that encode the scope path
4//! from file to the specific AST node. IDs follow the pattern:
5//!
6//! ```text
7//! src/main.rs::parse_input::$3::$0.left
8//! ```
9//!
10//! where `src/main.rs` is the file, `parse_input` is the function,
11//! `$3` is the 4th statement in the function body, and `$0.left` is
12//! the left operand of the first expression.
13//!
14//! Statement indices are positional within blocks (shift on insertion).
15//! This is correct: the merge algorithm handles reindexing via ThOrder
16//! pushouts (`has_order: true`).
17//!
18//! # Name disambiguation
19//!
20//! Within a single scope frame, the same name can appear multiple
21//! times. The clearest case is Python's `@typing.overload`:
22//!
23//! ```python
24//! @overload
25//! def foo(x: int) -> int: ...
26//! @overload
27//! def foo(x: str) -> str: ...
28//! def foo(x): return x
29//! ```
30//!
31//! The same shape arises for C++ overloads, Rust trait methods of the
32//! same name in different `impl` blocks, Erlang clauses, Lean / Idris
33//! dependent overloads, and any tree-sitter grammar whose `tags.scm`
34//! tags two siblings of the same kind with the same identifier. The
35//! generator deduplicates by suffixing repeat occurrences with `#N`:
36//!
37//! * first occurrence: `…::foo`
38//! * second: `…::foo#1`
39//! * third: `…::foo#2`
40//!
41//! The disambiguated leaf is recorded on the parent frame's `seen`
42//! table so that a subsequent [`IdGenerator::push_named_scope`] for
43//! the same name returns and persists the matching disambiguated
44//! string. Descendants of the second `foo` are prefixed `…::foo#1::…`,
45//! not `…::foo::…`, so two collisions never merge into one schema
46//! vertex transitively.
47//!
48//! Field IDs (the `.field` suffix used for expression-tree children)
49//! share the same disambiguation: a second [`IdGenerator::field_id`]
50//! call at the same base + field name yields `base.field#1`.
51
52use std::collections::HashMap;
53
54/// One scope frame.
55#[derive(Debug)]
56struct ScopeFrame {
57 /// The already-disambiguated scope name. Used verbatim as a
58 /// segment in [`IdGenerator::current_prefix`].
59 name: String,
60 /// Counter for anonymous (`$N`) children of this scope.
61 child_counter: u32,
62 /// Per-name occurrence counts. `seen[n]` is the number of prior
63 /// [`IdGenerator::named_id`] / [`IdGenerator::push_named_scope`]
64 /// calls at this frame for the bare name `n` (before suffixing).
65 /// The next call at index `seen[n]` is suffixed `#seen[n]`
66 /// (0 → no suffix; 1 → `#1`; 2 → `#2`; …).
67 ///
68 /// Field IDs use the same table, keyed by `field:<base>::<name>`,
69 /// so a second `field_id(parent, "args")` on the same parent
70 /// yields `parent.args#1`.
71 seen: HashMap<String, usize>,
72}
73
74/// Generates scope-aware vertex IDs for AST nodes.
75#[derive(Debug)]
76pub struct IdGenerator {
77 /// The scope stack: outermost (file path) first.
78 frames: Vec<ScopeFrame>,
79}
80
81impl IdGenerator {
82 /// Create a new generator rooted at the given file path.
83 #[must_use]
84 pub fn new(file_path: &str) -> Self {
85 Self {
86 frames: vec![ScopeFrame {
87 name: file_path.to_owned(),
88 child_counter: 0,
89 seen: HashMap::new(),
90 }],
91 }
92 }
93
94 /// Record an occurrence of `name` in the current frame and return
95 /// the disambiguated form: `name` for the first occurrence, then
96 /// `name#1`, `name#2`, ….
97 ///
98 /// The current frame is the *parent* of any vertex about to be
99 /// emitted under `name`, so disambiguation is one-per-parent.
100 ///
101 /// Callers that want a vertex ID and a matching scope push (the
102 /// walker's case when a named-scope node is also scope-introducing)
103 /// use this once and then [`Self::push_recorded_scope`] with the
104 /// returned leaf, so the scope-stack frame and the vertex ID agree
105 /// on the suffix.
106 pub fn record_name(&mut self, name: &str) -> String {
107 let Some(frame) = self.frames.last_mut() else {
108 return name.to_owned();
109 };
110 let count = frame.seen.entry(name.to_owned()).or_insert(0);
111 let n = *count;
112 *count += 1;
113 if n == 0 {
114 name.to_owned()
115 } else {
116 format!("{name}#{n}")
117 }
118 }
119
120 /// Push a named scope (function, class, method, module, etc.).
121 ///
122 /// Named scopes appear in the ID as their (disambiguated) name:
123 /// `file.rs::function_name::…`. Returns the disambiguated leaf so
124 /// the caller can use it as the vertex ID for the scope-introducing
125 /// node itself.
126 pub fn push_named_scope(&mut self, name: &str) -> String {
127 let leaf = self.record_name(name);
128 self.frames.push(ScopeFrame {
129 name: leaf.clone(),
130 child_counter: 0,
131 seen: HashMap::new(),
132 });
133 leaf
134 }
135
136 /// Push a scope using a name that was already disambiguated by
137 /// [`Self::record_name`]. The frame's `name` field becomes `leaf`
138 /// verbatim — no further suffixing.
139 ///
140 /// Walker pattern:
141 ///
142 /// ```ignore
143 /// let leaf = id_gen.record_name("foo");
144 /// let vertex_id = format!("{}::{leaf}", id_gen.current_prefix());
145 /// // … emit vertex, edges, constraints …
146 /// id_gen.push_recorded_scope(leaf); // frame = "foo" / "foo#1" / …
147 /// ```
148 pub fn push_recorded_scope(&mut self, leaf: String) {
149 self.frames.push(ScopeFrame {
150 name: leaf,
151 child_counter: 0,
152 seen: HashMap::new(),
153 });
154 }
155
156 /// Push an anonymous scope (block, statement body, etc.) and
157 /// return its positional index.
158 pub fn push_anonymous_scope(&mut self) -> u32 {
159 let Some(parent) = self.frames.last_mut() else {
160 return 0;
161 };
162 let idx = parent.child_counter;
163 parent.child_counter += 1;
164 let name = format!("${idx}");
165 self.frames.push(ScopeFrame {
166 name,
167 child_counter: 0,
168 seen: HashMap::new(),
169 });
170 idx
171 }
172
173 /// Pop the current scope, returning to the parent.
174 ///
175 /// Debug builds assert that the root frame is not popped. Walker
176 /// bugs that mis-pair push and pop surface here rather than as
177 /// mis-shaped IDs down the road.
178 pub fn pop_scope(&mut self) {
179 debug_assert!(
180 self.frames.len() > 1,
181 "IdGenerator::pop_scope called at root; walker push/pop is unbalanced"
182 );
183 if self.frames.len() > 1 {
184 self.frames.pop();
185 }
186 }
187
188 /// Generate an ID for a named node at the current scope level.
189 ///
190 /// Returns the full scope-qualified ID. The second call with the
191 /// same `name` at the same scope returns `name#1`, the third
192 /// `name#2`, and so on. Disambiguation is recorded so a matching
193 /// [`Self::push_named_scope`] call returns the same suffix.
194 pub fn named_id(&mut self, name: &str) -> String {
195 let leaf = self.record_name(name);
196 if self.frames.len() == 1 {
197 format!("{}::{leaf}", self.frames[0].name)
198 } else {
199 format!("{}::{leaf}", self.current_prefix())
200 }
201 }
202
203 /// Generate an ID for an anonymous (positional) node at the
204 /// current scope level.
205 ///
206 /// The index is auto-incremented within the current scope and is
207 /// independent of the named-id occurrence counters, so anonymous
208 /// and named children can interleave at one scope without
209 /// collision.
210 pub fn anonymous_id(&mut self) -> String {
211 let Some(frame) = self.frames.last_mut() else {
212 return String::new();
213 };
214 let idx = frame.child_counter;
215 frame.child_counter += 1;
216 format!("{}::${idx}", self.current_prefix())
217 }
218
219 /// Generate an ID with a field path suffix for expression
220 /// sub-nodes.
221 ///
222 /// Used for expression tree paths like `$3::$0.left` where
223 /// `.left` is the field name within the parent expression.
224 ///
225 /// Repeated calls with the same `base_id` and `field_name` are
226 /// disambiguated by a `#N` suffix, mirroring the named-id scheme.
227 /// Tree-sitter `field('xs', repeat($.X))` shapes produce many
228 /// `args` children under one parent; without disambiguation the
229 /// resulting IDs would collide.
230 pub fn field_id(&mut self, base_id: &str, field_name: &str) -> String {
231 let key = format!("field:{base_id}::{field_name}");
232 let Some(frame) = self.frames.last_mut() else {
233 return format!("{base_id}.{field_name}");
234 };
235 let count = frame.seen.entry(key).or_insert(0);
236 let n = *count;
237 *count += 1;
238 if n == 0 {
239 format!("{base_id}.{field_name}")
240 } else {
241 format!("{base_id}.{field_name}#{n}")
242 }
243 }
244
245 /// Get the current scope prefix (all scope components joined by `::`).
246 #[must_use]
247 pub fn current_prefix(&self) -> String {
248 let mut out = String::new();
249 for (i, frame) in self.frames.iter().enumerate() {
250 if i > 0 {
251 out.push_str("::");
252 }
253 out.push_str(&frame.name);
254 }
255 out
256 }
257
258 /// Get the current scope depth.
259 #[must_use]
260 pub fn depth(&self) -> usize {
261 self.frames.len()
262 }
263
264 /// Reset the anonymous-child counter for the current scope.
265 ///
266 /// Useful when entering a new block within the same scope level.
267 /// The `seen` table for named-id disambiguation is *not* reset;
268 /// resetting it would re-introduce duplicate-vertex bugs when a
269 /// caller intentionally interleaves blocks under one scope.
270 pub fn reset_counter(&mut self) {
271 if let Some(frame) = self.frames.last_mut() {
272 frame.child_counter = 0;
273 }
274 }
275}
276
277#[cfg(test)]
278mod tests {
279 use super::*;
280
281 #[test]
282 fn basic_named_ids() {
283 let mut id_gen = IdGenerator::new("src/main.rs");
284 assert_eq!(id_gen.named_id("User"), "src/main.rs::User");
285 }
286
287 #[test]
288 fn nested_scopes() {
289 let mut id_gen = IdGenerator::new("src/lib.rs");
290 id_gen.push_named_scope("Parser");
291 id_gen.push_named_scope("parse");
292 assert_eq!(
293 id_gen.named_id("config"),
294 "src/lib.rs::Parser::parse::config"
295 );
296 id_gen.pop_scope();
297 assert_eq!(id_gen.named_id("new"), "src/lib.rs::Parser::new");
298 }
299
300 #[test]
301 fn anonymous_ids_increment() {
302 let mut id_gen = IdGenerator::new("test.ts");
303 id_gen.push_named_scope("main");
304
305 let id0 = id_gen.anonymous_id();
306 let id1 = id_gen.anonymous_id();
307 let id2 = id_gen.anonymous_id();
308
309 assert_eq!(id0, "test.ts::main::$0");
310 assert_eq!(id1, "test.ts::main::$1");
311 assert_eq!(id2, "test.ts::main::$2");
312 }
313
314 #[test]
315 fn anonymous_scopes() {
316 let mut id_gen = IdGenerator::new("test.py");
317 id_gen.push_named_scope("process");
318 let _stmt_idx = id_gen.push_anonymous_scope(); // enters $0 scope
319 let inner = id_gen.anonymous_id();
320 assert_eq!(inner, "test.py::process::$0::$0");
321 id_gen.pop_scope();
322 let _stmt_idx2 = id_gen.push_anonymous_scope(); // enters $1 scope
323 let inner2 = id_gen.anonymous_id();
324 assert_eq!(inner2, "test.py::process::$1::$0");
325 }
326
327 #[test]
328 fn field_ids() {
329 let mut id_gen = IdGenerator::new("test.rs");
330 let base = id_gen.named_id("expr");
331 let left = id_gen.field_id(&base, "left");
332 let right = id_gen.field_id(&base, "right");
333 assert_eq!(left, "test.rs::expr.left");
334 assert_eq!(right, "test.rs::expr.right");
335 }
336
337 #[test]
338 fn depth_tracking() {
339 let mut id_gen = IdGenerator::new("f.ts");
340 assert_eq!(id_gen.depth(), 1);
341 id_gen.push_named_scope("fn");
342 assert_eq!(id_gen.depth(), 2);
343 id_gen.push_anonymous_scope();
344 assert_eq!(id_gen.depth(), 3);
345 id_gen.pop_scope();
346 assert_eq!(id_gen.depth(), 2);
347 }
348
349 /// The overload regression: three same-named declarations at the
350 /// same scope produce three distinct IDs.
351 #[test]
352 fn duplicate_named_ids_at_same_scope_get_suffixed() {
353 let mut id_gen = IdGenerator::new("repro.py");
354 assert_eq!(id_gen.named_id("foo"), "repro.py::foo");
355 assert_eq!(id_gen.named_id("foo"), "repro.py::foo#1");
356 assert_eq!(id_gen.named_id("foo"), "repro.py::foo#2");
357 }
358
359 /// Disambiguation must propagate to the scope stack so children of
360 /// the second `foo` are prefixed `foo#1`, not `foo`. Without this,
361 /// `foo::body` from the first overload and `foo::body` from the
362 /// second would still collide in the schema.
363 #[test]
364 fn push_named_scope_uses_disambiguated_leaf() {
365 let mut id_gen = IdGenerator::new("repro.py");
366 let first = id_gen.push_named_scope("foo");
367 assert_eq!(first, "foo");
368 assert_eq!(id_gen.named_id("body"), "repro.py::foo::body");
369 id_gen.pop_scope();
370
371 let second = id_gen.push_named_scope("foo");
372 assert_eq!(second, "foo#1");
373 assert_eq!(id_gen.named_id("body"), "repro.py::foo#1::body");
374 id_gen.pop_scope();
375
376 let third = id_gen.push_named_scope("foo");
377 assert_eq!(third, "foo#2");
378 }
379
380 /// `named_id` and `push_named_scope` share the same `seen` table;
381 /// using one then the other for the same bare name continues the
382 /// suffix sequence.
383 #[test]
384 fn named_id_and_push_share_disambiguation() {
385 let mut id_gen = IdGenerator::new("test.rs");
386 assert_eq!(id_gen.named_id("Foo"), "test.rs::Foo");
387 let leaf = id_gen.push_named_scope("Foo");
388 assert_eq!(leaf, "Foo#1");
389 id_gen.pop_scope();
390 assert_eq!(id_gen.named_id("Foo"), "test.rs::Foo#2");
391 }
392
393 /// Anonymous and named children interleave at the same scope
394 /// without collision.
395 #[test]
396 fn anonymous_and_named_interleave_cleanly() {
397 let mut id_gen = IdGenerator::new("f.rs");
398 let a = id_gen.anonymous_id();
399 let x1 = id_gen.named_id("x");
400 let b = id_gen.anonymous_id();
401 let x2 = id_gen.named_id("x");
402 assert_eq!(a, "f.rs::$0");
403 assert_eq!(x1, "f.rs::x");
404 assert_eq!(b, "f.rs::$1");
405 assert_eq!(x2, "f.rs::x#1");
406 }
407
408 /// Repeated field names at the same parent disambiguate.
409 #[test]
410 fn field_ids_disambiguate_under_repeat() {
411 let mut id_gen = IdGenerator::new("f.qvr");
412 let base = id_gen.named_id("call");
413 let a0 = id_gen.field_id(&base, "args");
414 let a1 = id_gen.field_id(&base, "args");
415 let a2 = id_gen.field_id(&base, "args");
416 assert_eq!(a0, "f.qvr::call.args");
417 assert_eq!(a1, "f.qvr::call.args#1");
418 assert_eq!(a2, "f.qvr::call.args#2");
419 }
420
421 /// Sibling scopes share no disambiguation state. Two functions
422 /// each containing a `foo` produce identical inner IDs.
423 #[test]
424 fn sibling_scopes_have_fresh_seen_tables() {
425 let mut id_gen = IdGenerator::new("f.rs");
426 id_gen.push_named_scope("a");
427 let inner_a = id_gen.named_id("foo");
428 id_gen.pop_scope();
429 id_gen.push_named_scope("b");
430 let inner_b = id_gen.named_id("foo");
431 id_gen.pop_scope();
432 assert_eq!(inner_a, "f.rs::a::foo");
433 assert_eq!(inner_b, "f.rs::b::foo");
434 }
435
436 /// `pop_scope` at the root is a no-op in release; the debug
437 /// assertion catches the bug.
438 #[test]
439 #[cfg_attr(
440 debug_assertions,
441 should_panic(expected = "walker push/pop is unbalanced")
442 )]
443 fn pop_at_root_debug_panics_release_noops() {
444 let mut id_gen = IdGenerator::new("f.rs");
445 id_gen.pop_scope();
446 // Release-mode reaches here; depth still 1.
447 assert_eq!(id_gen.depth(), 1);
448 }
449}