Skip to main content

panproto_parse/
id_scheme.rs

1//! Scope-aware vertex ID generation for full-AST schemas.
2//!
3//! Generates stable, human-readable vertex IDs that encode the scope path
4//! from file to the specific AST node. IDs follow the pattern:
5//!
6//! ```text
7//! src/main.rs::parse_input::$3::$0.left
8//! ```
9//!
10//! where `src/main.rs` is the file, `parse_input` is the function,
11//! `$3` is the 4th statement in the function body, and `$0.left` is
12//! the left operand of the first expression.
13//!
14//! Statement indices are positional within blocks (shift on insertion).
15//! This is correct: the merge algorithm handles reindexing via ThOrder
16//! pushouts (`has_order: true`).
17//!
18//! # Name disambiguation
19//!
20//! Within a single scope frame, the same name can appear multiple
21//! times. The clearest case is Python's `@typing.overload`:
22//!
23//! ```python
24//! @overload
25//! def foo(x: int) -> int: ...
26//! @overload
27//! def foo(x: str) -> str: ...
28//! def foo(x): return x
29//! ```
30//!
31//! The same shape arises for C++ overloads, Rust trait methods of the
32//! same name in different `impl` blocks, Erlang clauses, Lean / Idris
33//! dependent overloads, and any tree-sitter grammar whose `tags.scm`
34//! tags two siblings of the same kind with the same identifier. The
35//! generator deduplicates by suffixing repeat occurrences with `#N`:
36//!
37//! * first occurrence: `…::foo`
38//! * second:           `…::foo#1`
39//! * third:            `…::foo#2`
40//!
41//! The disambiguated leaf is recorded on the parent frame's `seen`
42//! table so that a subsequent [`IdGenerator::push_named_scope`] for
43//! the same name returns and persists the matching disambiguated
44//! string. Descendants of the second `foo` are prefixed `…::foo#1::…`,
45//! not `…::foo::…`, so two collisions never merge into one schema
46//! vertex transitively.
47//!
48//! Field IDs (the `.field` suffix used for expression-tree children)
49//! share the same disambiguation: a second [`IdGenerator::field_id`]
50//! call at the same base + field name yields `base.field#1`.
51
52use std::collections::HashMap;
53
54/// One scope frame.
55#[derive(Debug)]
56struct ScopeFrame {
57    /// The already-disambiguated scope name. Used verbatim as a
58    /// segment in [`IdGenerator::current_prefix`].
59    name: String,
60    /// Counter for anonymous (`$N`) children of this scope.
61    child_counter: u32,
62    /// Per-name occurrence counts. `seen[n]` is the number of prior
63    /// [`IdGenerator::named_id`] / [`IdGenerator::push_named_scope`]
64    /// calls at this frame for the bare name `n` (before suffixing).
65    /// The next call at index `seen[n]` is suffixed `#seen[n]`
66    /// (0 → no suffix; 1 → `#1`; 2 → `#2`; …).
67    ///
68    /// Field IDs use the same table, keyed by `field:<base>::<name>`,
69    /// so a second `field_id(parent, "args")` on the same parent
70    /// yields `parent.args#1`.
71    seen: HashMap<String, usize>,
72}
73
74/// Generates scope-aware vertex IDs for AST nodes.
75#[derive(Debug)]
76pub struct IdGenerator {
77    /// The scope stack: outermost (file path) first.
78    frames: Vec<ScopeFrame>,
79}
80
81impl IdGenerator {
82    /// Create a new generator rooted at the given file path.
83    #[must_use]
84    pub fn new(file_path: &str) -> Self {
85        Self {
86            frames: vec![ScopeFrame {
87                name: file_path.to_owned(),
88                child_counter: 0,
89                seen: HashMap::new(),
90            }],
91        }
92    }
93
94    /// Record an occurrence of `name` in the current frame and return
95    /// the disambiguated form: `name` for the first occurrence, then
96    /// `name#1`, `name#2`, ….
97    ///
98    /// The current frame is the *parent* of any vertex about to be
99    /// emitted under `name`, so disambiguation is one-per-parent.
100    ///
101    /// Callers that want a vertex ID and a matching scope push (the
102    /// walker's case when a named-scope node is also scope-introducing)
103    /// use this once and then [`Self::push_recorded_scope`] with the
104    /// returned leaf, so the scope-stack frame and the vertex ID agree
105    /// on the suffix.
106    pub fn record_name(&mut self, name: &str) -> String {
107        let Some(frame) = self.frames.last_mut() else {
108            return name.to_owned();
109        };
110        let count = frame.seen.entry(name.to_owned()).or_insert(0);
111        let n = *count;
112        *count += 1;
113        if n == 0 {
114            name.to_owned()
115        } else {
116            format!("{name}#{n}")
117        }
118    }
119
120    /// Push a named scope (function, class, method, module, etc.).
121    ///
122    /// Named scopes appear in the ID as their (disambiguated) name:
123    /// `file.rs::function_name::…`. Returns the disambiguated leaf so
124    /// the caller can use it as the vertex ID for the scope-introducing
125    /// node itself.
126    pub fn push_named_scope(&mut self, name: &str) -> String {
127        let leaf = self.record_name(name);
128        self.frames.push(ScopeFrame {
129            name: leaf.clone(),
130            child_counter: 0,
131            seen: HashMap::new(),
132        });
133        leaf
134    }
135
136    /// Push a scope using a name that was already disambiguated by
137    /// [`Self::record_name`]. The frame's `name` field becomes `leaf`
138    /// verbatim — no further suffixing.
139    ///
140    /// Walker pattern:
141    ///
142    /// ```ignore
143    /// let leaf = id_gen.record_name("foo");
144    /// let vertex_id = format!("{}::{leaf}", id_gen.current_prefix());
145    /// // … emit vertex, edges, constraints …
146    /// id_gen.push_recorded_scope(leaf);   // frame = "foo" / "foo#1" / …
147    /// ```
148    pub fn push_recorded_scope(&mut self, leaf: String) {
149        self.frames.push(ScopeFrame {
150            name: leaf,
151            child_counter: 0,
152            seen: HashMap::new(),
153        });
154    }
155
156    /// Push an anonymous scope (block, statement body, etc.) and
157    /// return its positional index.
158    pub fn push_anonymous_scope(&mut self) -> u32 {
159        let Some(parent) = self.frames.last_mut() else {
160            return 0;
161        };
162        let idx = parent.child_counter;
163        parent.child_counter += 1;
164        let name = format!("${idx}");
165        self.frames.push(ScopeFrame {
166            name,
167            child_counter: 0,
168            seen: HashMap::new(),
169        });
170        idx
171    }
172
173    /// Pop the current scope, returning to the parent.
174    ///
175    /// Debug builds assert that the root frame is not popped. Walker
176    /// bugs that mis-pair push and pop surface here rather than as
177    /// mis-shaped IDs down the road.
178    pub fn pop_scope(&mut self) {
179        debug_assert!(
180            self.frames.len() > 1,
181            "IdGenerator::pop_scope called at root; walker push/pop is unbalanced"
182        );
183        if self.frames.len() > 1 {
184            self.frames.pop();
185        }
186    }
187
188    /// Generate an ID for a named node at the current scope level.
189    ///
190    /// Returns the full scope-qualified ID. The second call with the
191    /// same `name` at the same scope returns `name#1`, the third
192    /// `name#2`, and so on. Disambiguation is recorded so a matching
193    /// [`Self::push_named_scope`] call returns the same suffix.
194    pub fn named_id(&mut self, name: &str) -> String {
195        let leaf = self.record_name(name);
196        if self.frames.len() == 1 {
197            format!("{}::{leaf}", self.frames[0].name)
198        } else {
199            format!("{}::{leaf}", self.current_prefix())
200        }
201    }
202
203    /// Generate an ID for an anonymous (positional) node at the
204    /// current scope level.
205    ///
206    /// The index is auto-incremented within the current scope and is
207    /// independent of the named-id occurrence counters, so anonymous
208    /// and named children can interleave at one scope without
209    /// collision.
210    pub fn anonymous_id(&mut self) -> String {
211        let Some(frame) = self.frames.last_mut() else {
212            return String::new();
213        };
214        let idx = frame.child_counter;
215        frame.child_counter += 1;
216        format!("{}::${idx}", self.current_prefix())
217    }
218
219    /// Generate an ID with a field path suffix for expression
220    /// sub-nodes.
221    ///
222    /// Used for expression tree paths like `$3::$0.left` where
223    /// `.left` is the field name within the parent expression.
224    ///
225    /// Repeated calls with the same `base_id` and `field_name` are
226    /// disambiguated by a `#N` suffix, mirroring the named-id scheme.
227    /// Tree-sitter `field('xs', repeat($.X))` shapes produce many
228    /// `args` children under one parent; without disambiguation the
229    /// resulting IDs would collide.
230    pub fn field_id(&mut self, base_id: &str, field_name: &str) -> String {
231        let key = format!("field:{base_id}::{field_name}");
232        let Some(frame) = self.frames.last_mut() else {
233            return format!("{base_id}.{field_name}");
234        };
235        let count = frame.seen.entry(key).or_insert(0);
236        let n = *count;
237        *count += 1;
238        if n == 0 {
239            format!("{base_id}.{field_name}")
240        } else {
241            format!("{base_id}.{field_name}#{n}")
242        }
243    }
244
245    /// Get the current scope prefix (all scope components joined by `::`).
246    #[must_use]
247    pub fn current_prefix(&self) -> String {
248        let mut out = String::new();
249        for (i, frame) in self.frames.iter().enumerate() {
250            if i > 0 {
251                out.push_str("::");
252            }
253            out.push_str(&frame.name);
254        }
255        out
256    }
257
258    /// Get the current scope depth.
259    #[must_use]
260    pub fn depth(&self) -> usize {
261        self.frames.len()
262    }
263
264    /// Reset the anonymous-child counter for the current scope.
265    ///
266    /// Useful when entering a new block within the same scope level.
267    /// The `seen` table for named-id disambiguation is *not* reset;
268    /// resetting it would re-introduce duplicate-vertex bugs when a
269    /// caller intentionally interleaves blocks under one scope.
270    pub fn reset_counter(&mut self) {
271        if let Some(frame) = self.frames.last_mut() {
272            frame.child_counter = 0;
273        }
274    }
275}
276
277#[cfg(test)]
278mod tests {
279    use super::*;
280
281    #[test]
282    fn basic_named_ids() {
283        let mut id_gen = IdGenerator::new("src/main.rs");
284        assert_eq!(id_gen.named_id("User"), "src/main.rs::User");
285    }
286
287    #[test]
288    fn nested_scopes() {
289        let mut id_gen = IdGenerator::new("src/lib.rs");
290        id_gen.push_named_scope("Parser");
291        id_gen.push_named_scope("parse");
292        assert_eq!(
293            id_gen.named_id("config"),
294            "src/lib.rs::Parser::parse::config"
295        );
296        id_gen.pop_scope();
297        assert_eq!(id_gen.named_id("new"), "src/lib.rs::Parser::new");
298    }
299
300    #[test]
301    fn anonymous_ids_increment() {
302        let mut id_gen = IdGenerator::new("test.ts");
303        id_gen.push_named_scope("main");
304
305        let id0 = id_gen.anonymous_id();
306        let id1 = id_gen.anonymous_id();
307        let id2 = id_gen.anonymous_id();
308
309        assert_eq!(id0, "test.ts::main::$0");
310        assert_eq!(id1, "test.ts::main::$1");
311        assert_eq!(id2, "test.ts::main::$2");
312    }
313
314    #[test]
315    fn anonymous_scopes() {
316        let mut id_gen = IdGenerator::new("test.py");
317        id_gen.push_named_scope("process");
318        let _stmt_idx = id_gen.push_anonymous_scope(); // enters $0 scope
319        let inner = id_gen.anonymous_id();
320        assert_eq!(inner, "test.py::process::$0::$0");
321        id_gen.pop_scope();
322        let _stmt_idx2 = id_gen.push_anonymous_scope(); // enters $1 scope
323        let inner2 = id_gen.anonymous_id();
324        assert_eq!(inner2, "test.py::process::$1::$0");
325    }
326
327    #[test]
328    fn field_ids() {
329        let mut id_gen = IdGenerator::new("test.rs");
330        let base = id_gen.named_id("expr");
331        let left = id_gen.field_id(&base, "left");
332        let right = id_gen.field_id(&base, "right");
333        assert_eq!(left, "test.rs::expr.left");
334        assert_eq!(right, "test.rs::expr.right");
335    }
336
337    #[test]
338    fn depth_tracking() {
339        let mut id_gen = IdGenerator::new("f.ts");
340        assert_eq!(id_gen.depth(), 1);
341        id_gen.push_named_scope("fn");
342        assert_eq!(id_gen.depth(), 2);
343        id_gen.push_anonymous_scope();
344        assert_eq!(id_gen.depth(), 3);
345        id_gen.pop_scope();
346        assert_eq!(id_gen.depth(), 2);
347    }
348
349    /// The overload regression: three same-named declarations at the
350    /// same scope produce three distinct IDs.
351    #[test]
352    fn duplicate_named_ids_at_same_scope_get_suffixed() {
353        let mut id_gen = IdGenerator::new("repro.py");
354        assert_eq!(id_gen.named_id("foo"), "repro.py::foo");
355        assert_eq!(id_gen.named_id("foo"), "repro.py::foo#1");
356        assert_eq!(id_gen.named_id("foo"), "repro.py::foo#2");
357    }
358
359    /// Disambiguation must propagate to the scope stack so children of
360    /// the second `foo` are prefixed `foo#1`, not `foo`. Without this,
361    /// `foo::body` from the first overload and `foo::body` from the
362    /// second would still collide in the schema.
363    #[test]
364    fn push_named_scope_uses_disambiguated_leaf() {
365        let mut id_gen = IdGenerator::new("repro.py");
366        let first = id_gen.push_named_scope("foo");
367        assert_eq!(first, "foo");
368        assert_eq!(id_gen.named_id("body"), "repro.py::foo::body");
369        id_gen.pop_scope();
370
371        let second = id_gen.push_named_scope("foo");
372        assert_eq!(second, "foo#1");
373        assert_eq!(id_gen.named_id("body"), "repro.py::foo#1::body");
374        id_gen.pop_scope();
375
376        let third = id_gen.push_named_scope("foo");
377        assert_eq!(third, "foo#2");
378    }
379
380    /// `named_id` and `push_named_scope` share the same `seen` table;
381    /// using one then the other for the same bare name continues the
382    /// suffix sequence.
383    #[test]
384    fn named_id_and_push_share_disambiguation() {
385        let mut id_gen = IdGenerator::new("test.rs");
386        assert_eq!(id_gen.named_id("Foo"), "test.rs::Foo");
387        let leaf = id_gen.push_named_scope("Foo");
388        assert_eq!(leaf, "Foo#1");
389        id_gen.pop_scope();
390        assert_eq!(id_gen.named_id("Foo"), "test.rs::Foo#2");
391    }
392
393    /// Anonymous and named children interleave at the same scope
394    /// without collision.
395    #[test]
396    fn anonymous_and_named_interleave_cleanly() {
397        let mut id_gen = IdGenerator::new("f.rs");
398        let a = id_gen.anonymous_id();
399        let x1 = id_gen.named_id("x");
400        let b = id_gen.anonymous_id();
401        let x2 = id_gen.named_id("x");
402        assert_eq!(a, "f.rs::$0");
403        assert_eq!(x1, "f.rs::x");
404        assert_eq!(b, "f.rs::$1");
405        assert_eq!(x2, "f.rs::x#1");
406    }
407
408    /// Repeated field names at the same parent disambiguate.
409    #[test]
410    fn field_ids_disambiguate_under_repeat() {
411        let mut id_gen = IdGenerator::new("f.qvr");
412        let base = id_gen.named_id("call");
413        let a0 = id_gen.field_id(&base, "args");
414        let a1 = id_gen.field_id(&base, "args");
415        let a2 = id_gen.field_id(&base, "args");
416        assert_eq!(a0, "f.qvr::call.args");
417        assert_eq!(a1, "f.qvr::call.args#1");
418        assert_eq!(a2, "f.qvr::call.args#2");
419    }
420
421    /// Sibling scopes share no disambiguation state. Two functions
422    /// each containing a `foo` produce identical inner IDs.
423    #[test]
424    fn sibling_scopes_have_fresh_seen_tables() {
425        let mut id_gen = IdGenerator::new("f.rs");
426        id_gen.push_named_scope("a");
427        let inner_a = id_gen.named_id("foo");
428        id_gen.pop_scope();
429        id_gen.push_named_scope("b");
430        let inner_b = id_gen.named_id("foo");
431        id_gen.pop_scope();
432        assert_eq!(inner_a, "f.rs::a::foo");
433        assert_eq!(inner_b, "f.rs::b::foo");
434    }
435
436    /// `pop_scope` at the root is a no-op in release; the debug
437    /// assertion catches the bug.
438    #[test]
439    #[cfg_attr(
440        debug_assertions,
441        should_panic(expected = "walker push/pop is unbalanced")
442    )]
443    fn pop_at_root_debug_panics_release_noops() {
444        let mut id_gen = IdGenerator::new("f.rs");
445        id_gen.pop_scope();
446        // Release-mode reaches here; depth still 1.
447        assert_eq!(id_gen.depth(), 1);
448    }
449}