Skip to main content

lvz_context/
symbols.rs

1//! Recursive symbol-dependency tracking (§6.1) — the graph that makes the
2//! skeleton-radius knob `N` real: "include full bodies for symbols within `N` dependency
3//! hops of the edit target."
4//!
5//! Edges are **resolved from the parse tree**, not from raw text. For each symbol we walk its
6//! subtree and collect the *reference identifiers* it uses (real `identifier`/`type_identifier`
7//! nodes), minus the identifiers it binds locally (parameters, `let`/variable patterns); an edge
8//! `A → B` exists when one of `A`'s references names a defined symbol `B`. Resolving through the
9//! AST means a name that only appears in a **string or comment** no longer creates a spurious
10//! edge, and a **local variable that shadows** a top-level symbol's name no longer links to it.
11//!
12//! Resolution is **scope-aware across files**: a reference is linked to a *same-file* definition
13//! whenever one exists, and only falls back to a cross-file definition (by name) when the name is
14//! not defined locally. So when two files each define a symbol of the same name, a caller links to
15//! *its own* file's definition instead of silently merging with the unrelated far one — the common
16//! same-name collision. (The budget loop relies on name-based cross-file linking for genuinely
17//! cross-file edges — e.g. a refactor where the callee lives in another file with no `use` — so that
18//! fallback is preserved.)
19//!
20//! It is still name-keyed on the *source* side, not a full module-qualified semantic index — two
21//! same-named definitions still share one out-edge key, and there is no `use`/`import` path
22//! resolution — but that is all the radius knob needs, and it stays fully deterministic, exactly
23//! what the §6.5 budget-fixture loop requires.
24
25use std::collections::{HashMap, HashSet};
26
27use tree_sitter::Node;
28
29use crate::lang::Lang;
30use crate::skeleton::{self, parse};
31
32/// A defined symbol's identity: the file it lives in plus its name. Same-named symbols in
33/// different files are **distinct** nodes (so a caller links to its own file's definition, not an
34/// unrelated far one) — the scope-awareness a flat name-keyed graph lacked.
35type Sym = (usize, String);
36
37/// A directed reference graph over file-scoped symbols: `(file, name) → the symbols it references`.
38#[derive(Debug, Default, Clone)]
39pub struct SymbolGraph {
40    edges: HashMap<Sym, HashSet<Sym>>,
41    file_count: usize,
42}
43
44impl SymbolGraph {
45    /// Build a graph from a single source file.
46    pub fn from_source(source: &str, lang: Lang) -> Self {
47        let mut defs = Vec::new();
48        collect_symbol_refs(source, lang, &mut defs);
49        SymbolGraph::link(vec![defs])
50    }
51
52    /// Build a graph spanning several files (e.g. a cross-file refactor fixture). A reference is
53    /// resolved to a *same-file* definition first; a cross-file edge (by name) forms only when the
54    /// name is not defined in the referencing file — so a call to a same-named local symbol no
55    /// longer merges with an unrelated definition elsewhere, while genuinely cross-file edges (the
56    /// budget loop's refactor case) still link.
57    pub fn from_sources<'a>(sources: impl IntoIterator<Item = (Lang, &'a str)>) -> Self {
58        // First pass: per file, per symbol, its name + the names it *references* (resolved from
59        // identifier nodes, minus its own locals). Second pass ([`link`]) resolves those names to
60        // file-scoped symbols, preferring the same file.
61        let per_file: Vec<Vec<(String, HashSet<String>)>> = sources
62            .into_iter()
63            .map(|(lang, source)| {
64                let mut defs = Vec::new();
65                collect_symbol_refs(source, lang, &mut defs);
66                defs
67            })
68            .collect();
69        SymbolGraph::link(per_file)
70    }
71
72    /// Resolve each symbol's referenced names to file-scoped target symbols. A name defined in the
73    /// **same file** resolves there (the precise, scope-aware case); otherwise it resolves to every
74    /// other file that defines it (the cross-file fallback — kept because the budget loop links
75    /// across files that share no `use`/`import`). A name defined nowhere is dropped.
76    fn link(per_file: Vec<Vec<(String, HashSet<String>)>>) -> Self {
77        let file_count = per_file.len();
78        // name → the files that define it, for cross-file fallback resolution.
79        let mut defined_in: HashMap<&str, Vec<usize>> = HashMap::new();
80        for (fi, defs) in per_file.iter().enumerate() {
81            for (name, _) in defs {
82                defined_in.entry(name.as_str()).or_default().push(fi);
83            }
84        }
85        let mut edges: HashMap<Sym, HashSet<Sym>> = HashMap::new();
86        for (fi, defs) in per_file.iter().enumerate() {
87            let local: HashSet<&str> = defs.iter().map(|(n, _)| n.as_str()).collect();
88            for (name, refs) in defs {
89                let entry = edges.entry((fi, name.clone())).or_default();
90                for r in refs {
91                    if r == name {
92                        continue; // not a reference to itself
93                    }
94                    if local.contains(r.as_str()) {
95                        entry.insert((fi, r.clone())); // same-file definition wins
96                    } else if let Some(files) = defined_in.get(r.as_str()) {
97                        for &tf in files {
98                            entry.insert((tf, r.clone())); // cross-file fallback (by name)
99                        }
100                    }
101                }
102            }
103        }
104        SymbolGraph { edges, file_count }
105    }
106
107    /// The set of symbol **names** within `radius` reference-hops of `target` (inclusive of
108    /// `target` itself), across all files. `radius` 0 yields just the target. Names are collapsed at
109    /// the end, so this matches the prior name-based API for single-file callers and the
110    /// skeletoniser; traversal itself is file-scoped, so same-named symbols no longer over-link.
111    pub fn neighbors_within(&self, target: &str, radius: u8) -> HashSet<String> {
112        self.reach(target, radius)
113            .into_iter()
114            .map(|(_, name)| name)
115            .collect()
116    }
117
118    /// Like [`neighbors_within`](Self::neighbors_within), but the names to keep are returned **per
119    /// file** (index → names defined in that file within radius). Lets a multi-file skeletoniser keep
120    /// a body only in the file that actually owns the reached symbol, instead of keeping every
121    /// same-named body. `vec[i]` is the keep set for the `i`-th source passed to
122    /// [`from_sources`](Self::from_sources).
123    pub fn neighbors_within_by_file(&self, target: &str, radius: u8) -> Vec<HashSet<String>> {
124        let mut per_file = vec![HashSet::new(); self.file_count.max(1)];
125        for (fi, name) in self.reach(target, radius) {
126            if let Some(set) = per_file.get_mut(fi) {
127                set.insert(name);
128            }
129        }
130        per_file
131    }
132
133    /// BFS over the file-scoped edges from every symbol named `target`, returning the reached
134    /// `(file, name)` set (inclusive of the seeds).
135    fn reach(&self, target: &str, radius: u8) -> HashSet<Sym> {
136        let mut visited: HashSet<Sym> = HashSet::new();
137        let mut frontier: Vec<Sym> = Vec::new();
138        // Seed with every file that defines a symbol of this name (usually one).
139        for fi in 0..self.file_count.max(1) {
140            let sym = (fi, target.to_string());
141            if self.edges.contains_key(&sym) && visited.insert(sym.clone()) {
142                frontier.push(sym);
143            }
144        }
145        // A target with no out-edges (e.g. radius queried on an unknown name) still returns itself.
146        if visited.is_empty() {
147            visited.insert((0, target.to_string()));
148        }
149        for _ in 0..radius {
150            let mut next = Vec::new();
151            for node in &frontier {
152                if let Some(refs) = self.edges.get(node) {
153                    for r in refs {
154                        if visited.insert(r.clone()) {
155                            next.push(r.clone());
156                        }
157                    }
158                }
159            }
160            if next.is_empty() {
161                break;
162            }
163            frontier = next;
164        }
165        visited
166    }
167
168    /// All symbol names known to the graph (deduplicated across files).
169    pub fn names(&self) -> impl Iterator<Item = &str> {
170        self.edges
171            .keys()
172            .map(|(_, name)| name.as_str())
173            .collect::<HashSet<_>>()
174            .into_iter()
175    }
176}
177
178/// Every 1-based line on which `name` occurs as a **reference identifier** (a real
179/// `identifier`/`type_identifier` node), not inside a string or comment. Each matching line is
180/// returned once, with its trimmed text, in source order.
181///
182/// This is the AST-aware core of the `find_references` tool: unlike a substring `grep`, a mention
183/// of `name` in a string literal or comment does **not** match, so the call-site set it returns is
184/// the *real* code references. (It is name-keyed, like the rest of this module — it does not resolve
185/// imports/visibility, so it matches every same-named identifier; that is exactly what "find all
186/// usages of this name" wants.)
187///
188/// Returns `None` only when the source **fails to parse** — so a caller can distinguish that from a
189/// successful parse with no identifier matches (e.g. the name appears only in a comment), and avoid
190/// falling back to a substring scan that would re-introduce the comment/string false positives.
191pub fn find_identifier_lines(source: &str, lang: Lang, name: &str) -> Option<Vec<(usize, String)>> {
192    let tree = parse(source, lang)?;
193    let spec = lang.spec();
194    let mut rows: HashSet<usize> = HashSet::new();
195    collect_named_ref_rows(tree.root_node(), source, &spec, name, &mut rows);
196    let src_lines: Vec<&str> = source.lines().collect();
197    let mut rows: Vec<usize> = rows.into_iter().collect();
198    rows.sort_unstable();
199    Some(
200        rows.into_iter()
201            .map(|row| {
202                let text = src_lines
203                    .get(row)
204                    .map(|s| s.trim())
205                    .unwrap_or("")
206                    .to_string();
207                (row + 1, text)
208            })
209            .collect(),
210    )
211}
212
213/// Collect the (0-based) rows of every reference-identifier node whose text equals `name`.
214fn collect_named_ref_rows(
215    node: Node,
216    source: &str,
217    spec: &crate::lang::LangSpec,
218    name: &str,
219    out: &mut HashSet<usize>,
220) {
221    if spec.ref_ident_kinds.contains(&node.kind()) && text(node, source) == name {
222        out.insert(node.start_position().row);
223    }
224    let mut cursor = node.walk();
225    for child in node.children(&mut cursor) {
226        collect_named_ref_rows(child, source, spec, name, out);
227    }
228}
229
230/// Skeletonise `source`, keeping full bodies for symbols within `radius` hops of `target`.
231/// Everything else is elided. This is the knob-`N` entry point used by the budget loop and
232/// the `outline_file` tool's focus mode.
233pub fn skeleton_with_radius(source: &str, lang: Lang, target: &str, radius: u8) -> String {
234    let keep = SymbolGraph::from_source(source, lang).neighbors_within(target, radius);
235    skeleton::skeletonize(source, lang, &keep)
236}
237
238/// Walk the tree collecting `(name, referenced names)` for every symbol-kind node. References are
239/// resolved from the parse tree (identifier nodes), not raw text, and exclude the symbol's own
240/// locals — so names appearing in strings/comments, and locals shadowing a top-level symbol, no
241/// longer create spurious edges.
242fn collect_symbol_refs(source: &str, lang: Lang, out: &mut Vec<(String, HashSet<String>)>) {
243    let Some(tree) = parse(source, lang) else {
244        return;
245    };
246    let spec = lang.spec();
247    walk_symbols(tree.root_node(), source, &spec, out);
248}
249
250fn walk_symbols(
251    node: Node,
252    source: &str,
253    spec: &crate::lang::LangSpec,
254    out: &mut Vec<(String, HashSet<String>)>,
255) {
256    if spec.symbol_kinds.contains(&node.kind()) {
257        if let Some(name) = node.child_by_field_name("name").map(|n| text(n, source)) {
258            let mut bound = HashSet::new();
259            collect_bound(node, source, spec, &mut bound);
260            let mut refs = HashSet::new();
261            collect_refs(node, source, spec, &bound, &mut refs);
262            refs.remove(&name); // a symbol is not a reference to itself
263            out.push((name, refs));
264        }
265    }
266    let mut cursor = node.walk();
267    for child in node.children(&mut cursor) {
268        walk_symbols(child, source, spec, out);
269    }
270}
271
272/// Collect the identifiers bound *locally* inside `node`: for every binder (parameter, `let`,
273/// variable declarator) the identifiers in its binding position only (`pattern`/`name` field, or
274/// — for parameter-list nodes that have neither — its direct children). The binder's *value* and
275/// *type* are intentionally not treated as bindings, so a `let x = helper()` still references
276/// `helper` while binding `x`.
277fn collect_bound(
278    node: Node,
279    source: &str,
280    spec: &crate::lang::LangSpec,
281    out: &mut HashSet<String>,
282) {
283    if spec.binder_kinds.contains(&node.kind()) {
284        match node
285            .child_by_field_name("pattern")
286            .or_else(|| node.child_by_field_name("name"))
287        {
288            Some(target) => collect_idents(target, source, spec, out),
289            None => {
290                let mut cursor = node.walk();
291                for child in node.children(&mut cursor) {
292                    collect_idents(child, source, spec, out);
293                }
294            }
295        }
296    }
297    let mut cursor = node.walk();
298    for child in node.children(&mut cursor) {
299        collect_bound(child, source, spec, out);
300    }
301}
302
303/// Every reference-identifier name in `node`'s subtree that is not locally `bound`.
304fn collect_refs(
305    node: Node,
306    source: &str,
307    spec: &crate::lang::LangSpec,
308    bound: &HashSet<String>,
309    out: &mut HashSet<String>,
310) {
311    if spec.ref_ident_kinds.contains(&node.kind()) {
312        let t = text(node, source);
313        if !bound.contains(&t) {
314            out.insert(t);
315        }
316    }
317    let mut cursor = node.walk();
318    for child in node.children(&mut cursor) {
319        collect_refs(child, source, spec, bound, out);
320    }
321}
322
323/// Every reference-identifier name in `node`'s subtree (binding-agnostic — used to harvest the
324/// names a binder introduces).
325fn collect_idents(
326    node: Node,
327    source: &str,
328    spec: &crate::lang::LangSpec,
329    out: &mut HashSet<String>,
330) {
331    if spec.ref_ident_kinds.contains(&node.kind()) {
332        out.insert(text(node, source));
333    }
334    let mut cursor = node.walk();
335    for child in node.children(&mut cursor) {
336        collect_idents(child, source, spec, out);
337    }
338}
339
340fn text(node: Node, source: &str) -> String {
341    source[node.start_byte()..node.end_byte()].to_string()
342}
343
344#[cfg(test)]
345mod tests {
346    use super::*;
347
348    const SRC: &str = "\
349fn helper(x: i32) -> i32 {
350    x + 1
351}
352
353fn target() -> i32 {
354    helper(41)
355}
356
357fn unrelated() -> i32 {
358    7
359}
360";
361
362    #[test]
363    fn references_in_strings_and_comments_do_not_link() {
364        // `helper` appears only in a string literal and a line comment — never as a real call —
365        // so the name-based substring heuristic would wrongly link, but AST resolution does not.
366        let src = "\
367fn helper() -> i32 { 1 }
368fn target() -> i32 {
369    // call helper here later
370    let s = \"remember to call helper\";
371    2
372}
373";
374        let g = SymbolGraph::from_source(src, Lang::Rust);
375        assert!(
376            !g.neighbors_within("target", 1).contains("helper"),
377            "string/comment mention must not create an edge"
378        );
379    }
380
381    #[test]
382    fn a_local_shadowing_a_symbol_name_does_not_link() {
383        // `target` binds a local named `helper`; the top-level `helper` fn is never called.
384        let src = "\
385fn helper() -> i32 { 1 }
386fn target() -> i32 {
387    let helper = 5;
388    helper + 1
389}
390";
391        let g = SymbolGraph::from_source(src, Lang::Rust);
392        assert!(
393            !g.neighbors_within("target", 1).contains("helper"),
394            "a shadowing local must not link to the same-named symbol"
395        );
396    }
397
398    #[test]
399    fn graph_links_caller_to_callee() {
400        let g = SymbolGraph::from_source(SRC, Lang::Rust);
401        let within1 = g.neighbors_within("target", 1);
402        assert!(within1.contains("target"));
403        assert!(within1.contains("helper"));
404        assert!(!within1.contains("unrelated"));
405    }
406
407    #[test]
408    fn radius_zero_is_just_the_target() {
409        let g = SymbolGraph::from_source(SRC, Lang::Rust);
410        let within0 = g.neighbors_within("target", 0);
411        assert_eq!(within0.len(), 1);
412        assert!(within0.contains("target"));
413    }
414
415    #[test]
416    fn skeleton_with_radius_keeps_only_dependencies() {
417        // radius 1 keeps target + helper bodies, elides unrelated.
418        let out = skeleton_with_radius(SRC, Lang::Rust, "target", 1);
419        assert!(out.contains("helper(41)"), "target body kept: {out}");
420        assert!(out.contains("x + 1"), "helper body kept: {out}");
421        assert!(!out.contains("    7\n"), "unrelated body elided: {out}");
422    }
423
424    #[test]
425    fn radius_zero_elides_dependencies_too() {
426        let out = skeleton_with_radius(SRC, Lang::Rust, "target", 0);
427        assert!(out.contains("helper(41)"), "target body kept: {out}");
428        assert!(
429            !out.contains("x + 1"),
430            "helper body should be elided at radius 0: {out}"
431        );
432    }
433
434    #[test]
435    fn find_identifier_lines_matches_code_not_strings_or_comments() {
436        let src = "\
437fn helper() -> i32 { 1 }
438fn target() -> i32 {
439    // helper is mentioned in this comment
440    let s = \"call helper here\";
441    helper()
442}
443";
444        let hits = find_identifier_lines(src, Lang::Rust, "helper").unwrap();
445        let rows: Vec<usize> = hits.iter().map(|(r, _)| *r).collect();
446        // Line 1 (definition) and line 5 (the real call) — NOT the comment (3) or string (4).
447        assert_eq!(rows, vec![1, 5], "got {hits:?}");
448        assert!(hits.iter().any(|(r, t)| *r == 5 && t.contains("helper()")));
449    }
450
451    #[test]
452    fn find_identifier_lines_dedups_a_line_with_two_uses() {
453        let src = "fn f() -> i32 { g() + g() }\nfn g() -> i32 { 1 }\n";
454        let hits = find_identifier_lines(src, Lang::Rust, "g").unwrap();
455        // The `g() + g()` line is reported once, plus the definition line.
456        assert_eq!(hits.iter().map(|(r, _)| *r).collect::<Vec<_>>(), vec![1, 2]);
457    }
458
459    #[test]
460    fn multi_file_graph_links_across_sources() {
461        let a = "fn caller() -> i32 { shared() }";
462        let b = "fn shared() -> i32 { 5 }";
463        let g = SymbolGraph::from_sources([(Lang::Rust, a), (Lang::Rust, b)]);
464        assert!(g.neighbors_within("caller", 1).contains("shared"));
465    }
466
467    #[test]
468    fn same_name_across_files_resolves_to_the_local_definition() {
469        // Both files define `helper`. File A's `target` calls the local `helper` (a no-op body);
470        // file B's `helper` pulls in a dependency `dep`. A name-keyed graph would merge both
471        // `helper`s, so `target`'s radius-1 set would wrongly include B's `dep`. Scope-aware
472        // resolution links `target` → A's `helper` only.
473        let a = "\
474fn helper() -> i32 { 0 }
475fn target() -> i32 { helper() }
476";
477        let b = "\
478fn helper() -> i32 { dep() }
479fn dep() -> i32 { 9 }
480";
481        let g = SymbolGraph::from_sources([(Lang::Rust, a), (Lang::Rust, b)]);
482        let within2 = g.neighbors_within("target", 2);
483        assert!(
484            within2.contains("helper"),
485            "local helper linked: {within2:?}"
486        );
487        assert!(
488            !within2.contains("dep"),
489            "must not reach the OTHER file's helper dependency: {within2:?}"
490        );
491    }
492
493    #[test]
494    fn neighbors_by_file_keeps_a_body_only_in_its_owning_file() {
495        // `target` (file 0) → `repo` (file 1). The per-file keep sets must place `repo` in file 1,
496        // not file 0, so a skeletoniser keeps the body where it actually lives.
497        let a = "fn target() -> i32 { repo() }\nfn noise_a() -> i32 { 0 }";
498        let b = "fn repo() -> i32 { 5 }\nfn noise_b() -> i32 { 1 }";
499        let g = SymbolGraph::from_sources([(Lang::Rust, a), (Lang::Rust, b)]);
500        let per_file = g.neighbors_within_by_file("target", 1);
501        assert_eq!(per_file.len(), 2);
502        assert!(per_file[0].contains("target") && !per_file[0].contains("repo"));
503        assert!(per_file[1].contains("repo") && !per_file[1].contains("target"));
504    }
505}