ripvec_core/repo_map.rs
1//! `PageRank`-weighted structural overview of a codebase.
2//!
3//! Builds a dependency graph from tree-sitter definition and import extraction,
4//! ranks files by importance using `PageRank` (standard or topic-sensitive), and
5//! renders a budget-constrained overview with tiered detail levels.
6
7use std::collections::{HashMap, HashSet};
8use std::fmt::Write as _;
9use std::path::{Path, PathBuf};
10
11use rayon::prelude::*;
12use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
13use streaming_iterator::StreamingIterator;
14use tree_sitter::{Parser, Query, QueryCursor};
15
16use serde::{Deserialize, Serialize};
17
18use crate::chunk::ContentKind;
19use crate::languages;
20use crate::walk;
21
22/// Serialize a `ContentKind` to a lowercase string tag for JSON output.
23fn content_kind_tag(ck: ContentKind) -> &'static str {
24 match ck {
25 ContentKind::Code => "code",
26 ContentKind::Docs => "docs",
27 ContentKind::Meta => "meta",
28 }
29}
30
31// ── Data Structures ──────────────────────────────────────────────────
32
/// Persisted dependency graph with `PageRank` scores.
///
/// The same call graph is stored at two granularities: file level (`edges`,
/// `base_ranks`, `callers`, `callees`) and definition level (the `def_*`
/// fields). The struct derives the `rkyv` archive traits.
///
/// NOTE(review): because the type is archived, changing field order or types
/// presumably invalidates previously persisted graphs — confirm before editing.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct RepoGraph {
    /// Files in the repository with definitions, imports, and calls.
    pub files: Vec<FileNode>,
    /// File-level edges (derived from def-level call edges).
    ///
    /// NOTE(review): the tuple layout is presumably
    /// `(source_file, target_file, weight)`, mirroring `def_edges` — confirm
    /// against the graph builder.
    pub edges: Vec<(u32, u32, u32)>,
    /// File-level `PageRank` scores (aggregated from def-level).
    pub base_ranks: Vec<f32>,
    /// File-level callers (indices into `files`).
    pub callers: Vec<Vec<u32>>,
    /// File-level callees (indices into `files`).
    pub callees: Vec<Vec<u32>>,
    /// Definition-level call edges: `(caller_def, callee_def, weight)`.
    pub def_edges: Vec<(DefId, DefId, u32)>,
    /// Definition-level `PageRank` scores (flattened: `offsets[file_idx] + def_idx`).
    pub def_ranks: Vec<f32>,
    /// Definition-level callers (flattened, parallel to `def_ranks`).
    pub def_callers: Vec<Vec<DefId>>,
    /// Definition-level callees (flattened, parallel to `def_ranks`).
    pub def_callees: Vec<Vec<DefId>>,
    /// Prefix-sum offsets for flattening `DefId` to linear index.
    ///
    /// The linear index of a `DefId` `(file_idx, def_idx)` is
    /// `def_offsets[file_idx] + def_idx`, as described on `def_ranks`.
    pub def_offsets: Vec<usize>,
    /// Auto-tuned alpha for search boost.
    pub alpha: f32,
}
59
/// A file in the repository with its definitions and imports.
///
/// One entry of [`RepoGraph::files`]; file-level indices elsewhere in the
/// graph (`callers`, `callees`, `ImportRef::resolved_idx`) refer to positions
/// in that vector.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct FileNode {
    /// Relative path from the repository root.
    pub path: String,
    /// Definitions (functions, structs, classes, etc.) extracted from this file.
    pub defs: Vec<Definition>,
    /// Import references extracted from this file.
    pub imports: Vec<ImportRef>,
}
70
/// A definition extracted from a source file.
///
/// Line numbers are 1-based while byte offsets are 0-based positions into the
/// file's source text. Both are stored as `u32`.
#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct Definition {
    /// Name of the definition (e.g., function name, class name).
    pub name: String,
    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
    pub kind: String,
    /// 1-based start line number.
    pub start_line: u32,
    /// 1-based end line number.
    pub end_line: u32,
    /// Scope chain (e.g., `"impl_item Foo > fn bar"`).
    pub scope: String,
    /// Function/method signature, if available.
    pub signature: Option<String>,
    /// Byte offset of this definition's start in the source file.
    pub start_byte: u32,
    /// Byte offset of this definition's end in the source file.
    pub end_byte: u32,
    /// Call sites within this definition's body.
    ///
    /// `extract_definitions` always creates this empty; it is filled in by a
    /// later call-extraction step.
    pub calls: Vec<CallRef>,
    /// The first decorator name for Python `decorated_definition` nodes (e.g.,
    /// `"property"`, `"classmethod"`, `"staticmethod"`, `"cached_property"`).
    ///
    /// `None` for all non-Python definitions and for bare (undecorated) Python
    /// functions and classes. Populated by `extract_definitions` at AST-parse
    /// time with full tree-sitter access (C1, 4.1.1).
    pub decorator: Option<String>,
    /// Decorator-aware LSP `SymbolKind` integer (e.g., 7=Property, 12=Function).
    ///
    /// Computed at parse time when the AST is available so projection sites
    /// (`render_json_budgeted`) do not need to re-parse. For Python `@property`
    /// or `@cached_property` → 7. For `@classmethod`, `@staticmethod`, or any
    /// other decorator → 12. `None` for all non-decorated definitions; callers
    /// fall back to `lsp_symbol_kind_for_node_kind(&self.kind)` when `None`
    /// (C1/C2, 4.1.1).
    pub lsp_kind_hint: Option<u32>,
}
109
/// An import reference extracted from a source file.
///
/// Keeps the import exactly as written alongside the outcome of resolving it
/// to a file inside the repository.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct ImportRef {
    /// Raw import path as written in source (e.g., `crate::foo::bar`).
    pub raw_path: String,
    /// Resolved file index in [`RepoGraph::files`], if resolution succeeded.
    ///
    /// `None` when the import could not be mapped to a file in the repository
    /// (for example, an external crate or package).
    pub resolved_idx: Option<u32>,
}
118
/// Unique identifier for a definition: (file index, definition index within file).
///
/// The `u16` component caps the per-file definition index at 65,535.
pub type DefId = (u32, u16);
121
/// A call site extracted from a definition body.
///
/// `name`, `qualified_path`, and `receiver_type` describe the call as written;
/// `resolved` records the definition it was matched to, if any.
#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct CallRef {
    /// Callee function/method name (bare, without qualifier).
    ///
    /// For scoped calls like `mod_a::foo()`, this is `"foo"`.
    /// For bare calls like `foo()`, this is `"foo"`.
    pub name: String,
    /// Full qualified path for scoped calls, e.g. `Some("mod_a::foo")`.
    ///
    /// `None` for bare (unqualified) calls. When `Some`, `resolve_calls`
    /// uses this for qualifier-based module disambiguation before falling
    /// back to the bare `name`.
    pub qualified_path: Option<String>,
    /// Receiver type for method calls, inferred from local context.
    ///
    /// Set to `Some("Foo")` when:
    /// - The call is `self.method()` inside `impl Foo { … }`.
    /// - The call is `x.method()` where `x` has an explicit type annotation `x: Foo`.
    /// - The call is `x.method()` after `let x = Foo::new()`.
    ///
    /// `None` for free function calls, or when the receiver type cannot be
    /// inferred from local context alone. When `Some`, `resolve_calls` prefers
    /// defs whose enclosing impl scope matches the receiver type.
    pub receiver_type: Option<String>,
    /// Byte offset of the call in the source file (for scoping to definitions).
    pub byte_offset: u32,
    /// Resolved target definition, if resolution succeeded.
    pub resolved: Option<DefId>,
}
152
153// ── JSON output types ────────────────────────────────────────────────
154
/// LSP-shaped location pointing at a file or symbol within a file.
///
/// Lines and characters are 0-based, matching the Language Server Protocol
/// convention so callers can pass this directly to LSP tools without any
/// conversion. Note that [`Definition`] stores 1-based lines, so producers of
/// this type are expected to convert.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapLspLocation {
    /// Relative path from the repository root (prefixed with `./`).
    pub file_path: String,
    /// 0-based start line.
    pub start_line: usize,
    /// 0-based start character (0 for file-level locations).
    pub start_character: usize,
    /// 0-based end line (equals `start_line` for file-level locations).
    pub end_line: usize,
    /// 0-based end character (0 for file-level locations).
    pub end_character: usize,
}
173
/// A top-level symbol extracted from a file in the repository map.
///
/// Analogous to an LSP `DocumentSymbol` but limited to the fields available
/// from tree-sitter definition extraction. The `rank` field carries the
/// definition-level `PageRank` score from [`RepoGraph::def_ranks`], enabling
/// callers to prioritise symbols by structural importance.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapSymbol {
    /// Symbol name (function name, struct name, etc.).
    pub name: String,
    /// LSP `SymbolKind` as a decimal — use the same values as
    /// `lsp_workspace_symbols` and `lsp_document_symbols`.
    ///
    /// For example, 7 is Property and 12 is Function (see
    /// [`Definition::lsp_kind_hint`]).
    pub kind: u32,
    /// Location pointing at the symbol's definition line (0-based).
    pub lsp_location: RepoMapLspLocation,
    /// Definition-level `PageRank` score from [`RepoGraph::def_ranks`].
    ///
    /// Higher values indicate definitions that are called by many other
    /// definitions. Used by the token-budget allocator to decide which
    /// symbols to include when the per-file budget is constrained.
    pub rank: f32,
}
196
/// An outgoing call-edge from a file to another file.
///
/// Carries both the target file's `lsp_location` and its `base_rank`
/// (file-level `PageRank` score) so callers can decide how important
/// each dependency is without a separate lookup.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapCall {
    /// Location pointing at the target file (line 0, character 0).
    pub lsp_location: RepoMapLspLocation,
    /// File-level `PageRank` score of the target file.
    ///
    /// This is the target's score, not a weight for the edge itself.
    pub rank: f32,
}
209
/// One file entry in the JSON repo map.
///
/// Carries the file's `PageRank` score, content kind, outgoing call-edges to
/// other files, and the file's top-level symbol definitions — all with
/// `lsp_location` so the caller can chain directly into LSP tools without
/// any destructuring.
///
/// The `truncated_*` counters report how many entries were left out of
/// `symbols` and `calls`, so consumers can tell a short list from a pruned one.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapFile {
    /// Location pointing at the file itself (line 0, character 0).
    ///
    /// Pass `lsp_location.file_path` directly into `lsp_document_symbols` or
    /// any other file-scoped tool.
    pub lsp_location: RepoMapLspLocation,
    /// `PageRank` score in [0, 1] (higher = more structurally central).
    pub rank: f32,
    /// Content classification: `"code"`, `"docs"`, or `"meta"`.
    ///
    /// Serialized as a lowercase string tag so JSON consumers can branch
    /// without numeric magic values. Mirrors the `ContentKind` enum in
    /// `ripvec-core::chunk`.
    pub content_kind: &'static str,
    /// Outgoing call-edges sorted by target file `PageRank` descending.
    pub calls: Vec<RepoMapCall>,
    /// Top-level definitions extracted from this file by tree-sitter,
    /// sorted by definition-level `PageRank` descending and pruned to
    /// the per-file token-budget allocation.
    pub symbols: Vec<RepoMapSymbol>,
    /// Number of symbols that were omitted due to budget exhaustion or
    /// logarithmic attenuation cutoff. `truncated_symbols + symbols.len()`
    /// equals the total definition count for the file.
    pub truncated_symbols: usize,
    /// Number of call-edges that were omitted due to the `MAX_FILE_CALLS`
    /// render cap or byte-budget exhaustion. `truncated_calls + calls.len()`
    /// equals the total callee count for the file (I#68, 4.1.4).
    pub truncated_calls: usize,
}
246
/// JSON-mode response envelope for `get_repo_map` (4.0.1 shape).
///
/// Replaces the `max_files`-capped shape from 4.0.0. The caller supplies a
/// `token_budget`; files are allocated bytes proportional to their `PageRank`
/// (40% cap per file, 200-byte envelope floor). Symbols are filled in
/// def-rank order with a logarithmic attenuation cutoff. Leftover bytes
/// cascade to subsequent files.
///
/// The `estimated_bytes`, `budget_bytes`, and `budget_exhausted` fields give
/// callers real-time feedback on how tightly the budget was consumed.
#[derive(Debug, Clone, Serialize)]
pub struct GetRepoMapResponse {
    /// Files sorted by `PageRank` descending, pruned to the token budget.
    pub files: Vec<RepoMapFile>,
    /// Total number of eligible files in the graph (pre-allocation).
    ///
    /// If `total_files > files.len()`, the budget ran out before all files
    /// could be included. Read `budget_exhausted` directly for the boolean.
    pub total_files: usize,
    /// Actual serialised-JSON byte count for all returned content.
    pub estimated_bytes: usize,
    /// Budget ceiling in bytes that was used for allocation
    /// (`token_budget * 4`).
    ///
    /// The factor of 4 matches the `CHARS_PER_TOKEN` constant in this module.
    pub budget_bytes: usize,
    /// `true` when `total_files > files.len()` (budget was exhausted before
    /// all eligible files were included).
    pub budget_exhausted: bool,
    /// Retained for backward compatibility with 4.0.0 callers that checked
    /// `capped`. Equivalent to `budget_exhausted`.
    pub capped: bool,
}
278
279// ── Constants ────────────────────────────────────────────────────────
280
/// `PageRank` damping factor.
///
/// NOTE(review): conventionally the probability of following an edge rather
/// than teleporting; the iteration that consumes this constant is outside
/// this section — confirm usage there.
const DAMPING: f32 = 0.85;
283
/// `PageRank` convergence threshold.
///
/// NOTE(review): presumably compared against the change in ranks between
/// successive iterations; the exact norm is defined where the iteration is
/// implemented — confirm there.
const EPSILON: f32 = 1e-6;
286
/// Maximum `PageRank` iterations.
///
/// Upper bound on iteration count, alongside the [`EPSILON`] convergence
/// threshold.
const MAX_ITERATIONS: usize = 100;
289
/// Maximum callers/callees stored per file (display-oriented neighbor lists).
///
/// Raised from 5 → 25 in 4.1.3 (I#60): hub functions in real Rust/Python/Go
/// corpora commonly have 10-25 outgoing call edges; the old cap of 5 caused
/// every large-scale corpus file to report `truncated_calls ≥ 2` and left
/// agents reading `get_repo_map.files[i].calls[]` with a sparse skeleton.
///
/// The serialisation-size cost is bounded by the existing `token_budget`
/// allocator, which gates each file's call list against its byte allocation.
/// BFS-internal reachability (`compute_dead_code`) uses the untruncated
/// `def_edges` CSR directly (I#57/I#61) and is NOT affected by this constant.
///
/// Keep in step with [`MAX_FILE_CALLS`], which is documented as symmetric
/// with this cap.
const MAX_NEIGHBORS: usize = 25;
302
/// Maximum number of outgoing call entries rendered per file in the JSON response.
///
/// Applied at render time in [`render_json_budgeted`] as a hard cap on
/// `calls[]` length (I#68, 4.1.4). Symmetric with the graph-build cap
/// [`MAX_NEIGHBORS`] so agents always see up to 25 callees — matching the
/// def-level cap introduced in I#60.
///
/// Replaces the logarithmic attenuation cutoff that was previously applied to
/// file-level callees. Attenuation is appropriate for *symbol* lists (where
/// rank distributions are informative) but pathological for *call-edge* lists:
/// in real corpora callee base-ranks follow a geometric distribution, causing
/// the attenuation to fire at pos=1 and collapse `calls[]` to a single entry.
/// The byte-budget check is retained; this constant adds a count ceiling.
///
/// The value is repeated literally rather than derived from `MAX_NEIGHBORS`,
/// so the two constants must be changed together to stay symmetric.
const MAX_FILE_CALLS: usize = 25;
317
/// Approximate characters per token for budget estimation.
///
/// Matches the `token_budget * 4` conversion documented on
/// [`GetRepoMapResponse::budget_bytes`].
const CHARS_PER_TOKEN: usize = 4;
320
/// Concentration mass placed on the focus node in topic-sensitive `PageRank`.
///
/// Following Haveliwala 2002 ("Topic-Sensitive PageRank"), the personalization
/// vector places a bias `α` on the focus node and distributes the remaining
/// `1 - α` uniformly over all other nodes. This preserves rank dispersion
/// across the corpus — the user sees a *neighborhood* of related files
/// rebiased toward the focus, not a Dirac delta on the focus node with
/// every other file collapsed to an equally negligible uniform floor.
///
/// Value 0.35 means:
/// - focus node teleportation probability = 0.35
/// - each of the (n - 1) other nodes = 0.65 / (n - 1)
///
/// Calibration history:
/// - Pre-4.0.5: α = 0.70 → winner-take-all collapse (flask focus = 0.703,
///   all others ≈ 0.003); fixed under I#16.
/// - 4.0.5 → 4.1.11: α = 0.15 → preserved dispersion but underbiased on
///   real corpora: flask focus blueprints.py landed at position #5-#7
///   instead of top-3 because structural hubs (helpers.py, app.py)
///   dominated.
/// - 4.1.12+: α = 0.35 → focus reliably surfaces in top-3 on flask
///   (rank ~0.10 vs hub rank ~0.10), small-graph dispersion tests still
///   pass (n=10 star dispersion ratio remains under the 40× ceiling),
///   J2 file-count parity relaxed from 80% to 70% as the unavoidable
///   trade-off for stronger focus bias.
///
/// This `α` is the personalization weight, distinct from the [`DAMPING`]
/// factor and from the auto-tuned search-boost `alpha` on [`RepoGraph`].
const PERSONALIZATION_ALPHA: f32 = 0.35;
347
348// ── Import Queries ───────────────────────────────────────────────────
349
350/// Compile a tree-sitter import query for the given extension.
351///
352/// Returns `None` for unsupported extensions.
353fn import_query_for_extension(ext: &str) -> Option<(tree_sitter::Language, Query)> {
354 let (lang, query_str): (tree_sitter::Language, &str) = match ext {
355 "rs" => (
356 tree_sitter_rust::LANGUAGE.into(),
357 "(use_declaration) @import",
358 ),
359 "py" | "pyi" => (
360 tree_sitter_python::LANGUAGE.into(),
361 concat!(
362 "(import_statement) @import\n",
363 "(import_from_statement) @import",
364 ),
365 ),
366 "js" | "jsx" => (
367 tree_sitter_javascript::LANGUAGE.into(),
368 "(import_statement source: (string) @import_path) @import",
369 ),
370 "ts" => (
371 tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
372 "(import_statement source: (string) @import_path) @import",
373 ),
374 "tsx" => (
375 tree_sitter_typescript::LANGUAGE_TSX.into(),
376 "(import_statement source: (string) @import_path) @import",
377 ),
378 "go" => (
379 tree_sitter_go::LANGUAGE.into(),
380 "(import_spec path: (interpreted_string_literal) @import_path) @import",
381 ),
382 // Ruby: require statements.
383 "rb" => (
384 tree_sitter_ruby::LANGUAGE.into(),
385 "(call method: (identifier) @_method arguments: (argument_list (string (string_content) @import_path)) (#eq? @_method \"require\")) @import",
386 ),
387 _ => return None,
388 };
389 let query = match Query::new(&lang, query_str) {
390 Ok(q) => q,
391 Err(e) => {
392 tracing::warn!(ext, %e, "import query compilation failed — language may be ABI-incompatible");
393 return None;
394 }
395 };
396 Some((lang, query))
397}
398
399/// Extract import paths from source using tree-sitter.
400fn extract_imports(
401 source: &str,
402 lang: &tree_sitter::Language,
403 import_query: &Query,
404) -> Vec<String> {
405 let mut parser = Parser::new();
406 if parser.set_language(lang).is_err() {
407 return vec![];
408 }
409 let Some(tree) = parser.parse(source, None) else {
410 return vec![];
411 };
412
413 let mut cursor = QueryCursor::new();
414 let mut imports = Vec::new();
415 let mut matches = cursor.matches(import_query, tree.root_node(), source.as_bytes());
416
417 while let Some(m) = matches.next() {
418 // Prefer @import_path capture (JS/TS/Go), fall back to full @import text
419 let mut import_path_text = None;
420 let mut import_text = None;
421
422 for cap in m.captures {
423 let cap_name = &import_query.capture_names()[cap.index as usize];
424 let text = &source[cap.node.start_byte()..cap.node.end_byte()];
425 if *cap_name == "import_path" {
426 import_path_text = Some(text.trim_matches(|c| c == '"' || c == '\''));
427 } else if *cap_name == "import" {
428 import_text = Some(text);
429 }
430 }
431
432 if let Some(path) = import_path_text {
433 imports.push(path.to_string());
434 } else if let Some(text) = import_text {
435 imports.push(text.to_string());
436 }
437 }
438
439 imports
440}
441
442// ── Import Resolution ────────────────────────────────────────────────
443
/// Resolve a Rust `use` path to a file index in the file map.
///
/// Handles `crate::`, `self::`, and `super::` prefixes (including chained
/// `super::super::`), with or without a leading visibility modifier such as
/// `pub` or `pub(crate)`. External crate imports are dropped (returns `None`).
///
/// Base directories:
/// - `crate::` — the `src/` directory next to the nearest ancestor
///   `Cargo.toml`, falling back to `<root>/src`.
/// - `self::` — the directory that holds the current module's children:
///   `dir/` for `mod.rs`/`lib.rs`/`main.rs`, `dir/<stem>/` for `dir/<stem>.rs`.
///   The plain containing directory is kept as a fallback so imports that
///   resolved under the older lookup still resolve.
/// - `super::` — one level above each `self::` base per `super` segment.
///
/// Trailing path segments may name items rather than modules, so progressively
/// shorter prefixes are tried until a `<path>.rs` or `<path>/mod.rs` entry is
/// found in `file_index`.
fn resolve_rust_import(
    raw: &str,
    file_path: &Path,
    root: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    let use_path = strip_rust_use_prefix(raw);
    let segments: Vec<&str> = use_path.split("::").map(str::trim).collect();
    let (head, mut tail) = segments.split_first()?;

    let bases: Vec<PathBuf> = match *head {
        "crate" => vec![nearest_crate_src_dir(file_path, root)],
        "self" => rust_module_dirs(file_path)?,
        "super" => {
            // Each additional leading `super` climbs one more directory.
            let mut levels = 1usize;
            while tail.first() == Some(&"super") {
                levels += 1;
                tail = &tail[1..];
            }
            rust_module_dirs(file_path)?
                .into_iter()
                .filter_map(|dir| dir.ancestors().nth(levels).map(Path::to_path_buf))
                .collect()
        }
        // External crate — drop
        _ => return None,
    };

    bases
        .iter()
        .find_map(|base| lookup_rust_module_file(base, tail, file_index))
}

/// Strip an optional visibility modifier, the `use` keyword, and the trailing `;`.
fn strip_rust_use_prefix(raw: &str) -> &str {
    let mut rest = raw.trim();

    if let Some(after_pub) = rest.strip_prefix("pub") {
        // Guard against identifiers that merely start with "pub".
        if after_pub.starts_with(|c: char| c.is_whitespace() || c == '(') {
            let after_pub = after_pub.trim_start();
            rest = match after_pub.strip_prefix('(') {
                // `pub(crate)`, `pub(super)`, `pub(in path)`: skip past `)`.
                Some(restriction) => match restriction.find(')') {
                    Some(close) => restriction[close + 1..].trim_start(),
                    None => after_pub,
                },
                None => after_pub,
            };
        }
    }

    if let Some(after_use) = rest.strip_prefix("use") {
        if after_use.starts_with(char::is_whitespace) {
            rest = after_use.trim_start();
        }
    }

    rest.trim_end_matches(';').trim()
}

/// Candidate directories holding the child modules of the module in `file_path`.
///
/// The most specific candidate comes first; the containing directory is always
/// last. Returns `None` when `file_path` has no parent.
fn rust_module_dirs(file_path: &Path) -> Option<Vec<PathBuf>> {
    let dir = file_path.parent()?;
    let mut dirs = Vec::with_capacity(2);
    match file_path.file_stem().and_then(|stem| stem.to_str()) {
        // Module-root files keep their children directly beside them.
        Some("mod") | Some("lib") | Some("main") | None => {}
        // `dir/foo.rs` keeps its children under `dir/foo/`.
        Some(stem) => dirs.push(dir.join(stem)),
    }
    dirs.push(dir.to_path_buf());
    Some(dirs)
}

/// `src/` next to the nearest ancestor `Cargo.toml`, or `<root>/src` if none exists.
///
/// In a workspace, `crate::foo` resolves relative to the member crate's
/// `src/`, not the workspace root.
fn nearest_crate_src_dir(file_path: &Path, root: &Path) -> PathBuf {
    file_path
        .ancestors()
        .skip(1)
        .find(|dir| dir.join("Cargo.toml").exists())
        .map_or_else(|| root.join("src"), |dir| dir.join("src"))
}

/// Try `base/<segments>` as `file.rs` then `dir/mod.rs`, shortening the path from the right.
fn lookup_rust_module_file(
    base: &Path,
    segments: &[&str],
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    (1..=segments.len()).rev().find_map(|end| {
        let mut candidate = base.to_path_buf();
        for segment in &segments[..end] {
            // Strip brace groups like `{Foo, Bar}`.
            let module = segment.split('{').next().unwrap_or(segment).trim();
            if !module.is_empty() {
                candidate.push(module);
            }
        }
        file_index
            .get(&candidate.with_extension("rs"))
            .or_else(|| file_index.get(&candidate.join("mod.rs")))
            .copied()
    })
}
523
524/// Resolve an import path to a file index based on file extension.
525fn resolve_import(
526 raw: &str,
527 ext: &str,
528 file_path: &Path,
529 root: &Path,
530 file_index: &HashMap<PathBuf, usize>,
531) -> Option<usize> {
532 match ext {
533 "rs" => resolve_rust_import(raw, file_path, root, file_index),
534 "py" | "pyi" => resolve_python_import(raw, root, file_index),
535 "js" | "jsx" | "ts" | "tsx" => resolve_js_import(raw, file_path, file_index),
536 // Go imports use full package paths — skip local resolution
537 _ => None,
538 }
539}
540
/// Resolve a Python import to a file index.
///
/// Handles `import foo.bar`, `import foo.bar as fb`, `import a, b` and
/// `from foo.bar import baz` patterns. The dotted module name is mapped to a
/// path under `root` and looked up, in order, as `<module>.py`, `<module>.pyi`,
/// `<module>/__init__.py`, `<module>/__init__.pyi`.
///
/// For comma-separated `import a, b` statements each module is tried in
/// source order and the first one present in `file_index` wins, since a single
/// index is all this function can return.
///
/// Returns `None` when `raw` is not an import statement or no listed module
/// maps to an indexed file. The module path is always resolved against `root`;
/// relative imports (`from .foo import x`) receive no package-relative handling
/// because the importing file's path is not available here.
fn resolve_python_import(
    raw: &str,
    root: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    // Look up one dotted module name as a module file, then as a package.
    let lookup = |module: &str| -> Option<usize> {
        let rel_path: PathBuf = module.split('.').collect();
        let base = root.join(&rel_path);
        ["py", "pyi"]
            .iter()
            .map(|ext| base.with_extension(ext))
            .chain(
                ["__init__.py", "__init__.pyi"]
                    .iter()
                    .map(|init_name| base.join(init_name)),
            )
            .find_map(|candidate| file_index.get(&candidate).copied())
    };

    let statement = raw.trim_start();
    if let Some(rest) = statement.strip_prefix("from ") {
        lookup(rest.split_whitespace().next()?)
    } else if let Some(rest) = statement.strip_prefix("import ") {
        // `import a as x, b.c` — each clause's first token is a module name.
        rest.split(',')
            .filter_map(|clause| clause.split_whitespace().next())
            .find_map(lookup)
    } else {
        None
    }
}
574
/// Resolve a JS/TS import to a file index.
///
/// Handles relative specifiers like `./foo` or `../bar`; bare specifiers
/// (packages such as `react`) return `None`.
///
/// The specifier is joined onto the importing file's directory and normalized
/// lexically (`.` dropped, `..` collapsed), because `PathBuf` equality does not
/// collapse `..` and `file_index` keys would otherwise never match. Candidates
/// are then tried in this order:
/// 1. the path exactly as written;
/// 2. the path with `.js`/`.jsx`/`.ts`/`.tsx` appended, so dotted names such as
///    `./user.service` find `user.service.ts`;
/// 3. the path with its last extension replaced, so `./util.js` can still find
///    `util.ts`;
/// 4. `<path>/index.{js,jsx,ts,tsx}` for directory imports.
fn resolve_js_import(
    raw: &str,
    file_path: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    const EXTENSIONS: [&str; 4] = ["js", "jsx", "ts", "tsx"];

    if !raw.starts_with('.') {
        return None;
    }

    let dir = file_path.parent()?;
    let candidate = normalize_js_import_path(&dir.join(raw));
    let lookup = |path: &Path| file_index.get(path).copied();

    if let Some(idx) = lookup(&candidate) {
        return Some(idx);
    }

    // Append the extension first: `with_extension` would clobber `.service`
    // in `user.service`.
    for ext in EXTENSIONS.iter() {
        let mut appended = candidate.clone().into_os_string();
        appended.push(".");
        appended.push(ext);
        if let Some(idx) = lookup(Path::new(&appended)) {
            return Some(idx);
        }
    }

    // Extension replacement: covers `./foo.js` written for `foo.ts`.
    for ext in EXTENSIONS.iter() {
        if let Some(idx) = lookup(&candidate.with_extension(ext)) {
            return Some(idx);
        }
    }

    // Directory import: `./widgets` → `./widgets/index.tsx`.
    let index_stem = candidate.join("index");
    EXTENSIONS
        .iter()
        .find_map(|ext| lookup(&index_stem.with_extension(ext)))
}

/// Lexically remove `.` components and collapse `..` against preceding names.
///
/// Purely textual: the filesystem is not consulted and symlinks are not
/// followed. A `..` that would climb above a root is dropped; a leading `..`
/// on a relative path is preserved.
fn normalize_js_import_path(path: &Path) -> PathBuf {
    let mut out = PathBuf::new();
    for component in path.components() {
        match component {
            std::path::Component::CurDir => {}
            std::path::Component::ParentDir => {
                let ends_with_name = matches!(
                    out.components().next_back(),
                    Some(std::path::Component::Normal(_))
                );
                if ends_with_name {
                    out.pop();
                } else if !out.has_root() {
                    out.push("..");
                }
            }
            other => out.push(other.as_os_str()),
        }
    }
    out
}
606
607// ── Extraction ───────────────────────────────────────────────────────
608
609/// Extract the name of the first decorator from a `decorated_definition` node.
610///
611/// Mirrors the private `languages::first_decorator_ident` but inlined here
612/// because that function is private to the `languages` module.
613///
614/// For simple `@name` decorators (e.g., `@property`, `@classmethod`), returns
615/// `Some("property")` or `Some("classmethod")`.
616///
617/// For attribute-access decorators (e.g., `@functools.lru_cache`), returns
618/// `Some("functools.lru_cache")` — the full dotted-name text.
619///
620/// For call-expression decorators (e.g., `@app.route("/")`), returns `None`.
621fn extract_first_decorator_name(node: &tree_sitter::Node<'_>, source: &[u8]) -> Option<String> {
622 let mut cursor = node.walk();
623 for child in node.children(&mut cursor) {
624 if child.kind() == "decorator" {
625 let mut inner = child.walk();
626 for inner_child in child.children(&mut inner) {
627 match inner_child.kind() {
628 // Simple name (@property) or attribute access (@functools.lru_cache) —
629 // return the full text so callers can store it for display.
630 "identifier" | "attribute" => {
631 return std::str::from_utf8(
632 &source[inner_child.start_byte()..inner_child.end_byte()],
633 )
634 .ok()
635 .map(str::to_owned);
636 }
637 // Call expression — ambiguous; treat as None.
638 "call" => return None,
639 _ => {}
640 }
641 }
642 return None;
643 }
644 }
645 None
646}
647
648/// Determine whether a Python `(assignment)` or JS/TS `(variable_declarator)`
649/// def captured by the def-query is spuriously nested inside a function body.
650///
651/// Cycle 10 W1 Front A — the Python def-query at `languages.rs:637` and the
652/// JS/TS variants at `:647`, `:657`, `:669` capture every `x = foo()` or
653/// `const x = foo()` site, even those inside function bodies, as defs. The
654/// `extract_calls` smallest-enclosing rule (`repo_map.rs:827-832`) then steals
655/// every call out of the surrounding function into the nested assignment def.
656/// Result: BFS terminates early because function defs have no outgoing edges.
657///
658/// Fix: walk up the AST from the def node. If we hit a function-body context
659/// (Python `function_definition`, JS/TS `function_declaration`,
660/// `function_expression`, `arrow_function`, `method_definition`,
661/// `generator_function_declaration`, `generator_function`) before reaching the
662/// module root, the def is spurious and must be dropped.
663///
664/// Module-level constants (`MAX = 4096`) and class-attribute assignments
665/// (`class Foo: bar = make_bar()`) — both of which are legitimate defs — are
666/// preserved because their ancestor chain contains only `module` /
667/// `class_definition` / `class_body` / `block` / `program` nodes, never a
668/// function-body container.
669fn is_spurious_nested_binding_def(kind: &str, node: tree_sitter::Node<'_>) -> bool {
670 // Only the assignment / variable_declarator captures are at risk.
671 // Other def kinds (function_definition, class_definition, method_definition,
672 // function_declaration, class_declaration, type_alias_declaration, etc.)
673 // are always legitimate at any nesting depth (e.g. nested functions,
674 // methods inside classes).
675 if !matches!(kind, "assignment" | "variable_declarator") {
676 return false;
677 }
678 let mut cur = node.parent();
679 while let Some(parent) = cur {
680 match parent.kind() {
681 // Function-body containers: any def captured beneath one of these
682 // is a local-variable binding, not a module/class-level def.
683 "function_definition" // Python
684 | "function_declaration" // JS / TS
685 | "function_expression" // JS / TS
686 | "arrow_function" // JS / TS
687 | "method_definition" // JS / TS
688 | "generator_function_declaration"
689 | "generator_function" => return true,
690 // Top-level containers — we reached the module without finding a
691 // function ancestor. The def is legitimate.
692 "module" | "program" => return false,
693 _ => {}
694 }
695 cur = parent.parent();
696 }
697 false
698}
699
700/// Extract definitions from a source file using tree-sitter.
701fn extract_definitions(source: &str, config: &languages::LangConfig) -> Vec<Definition> {
702 let mut parser = Parser::new();
703 if parser.set_language(&config.language).is_err() {
704 return vec![];
705 }
706 let Some(tree) = parser.parse(source, None) else {
707 return vec![];
708 };
709
710 let mut cursor = QueryCursor::new();
711 let mut defs = Vec::new();
712 let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
713
714 while let Some(m) = matches.next() {
715 let mut name = String::new();
716 let mut def_node = None;
717
718 for cap in m.captures {
719 let cap_name = &config.query.capture_names()[cap.index as usize];
720 if *cap_name == "name" {
721 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
722 } else if *cap_name == "def" {
723 def_node = Some(cap.node);
724 }
725 }
726
727 if let Some(node) = def_node {
728 // Cycle 10 W1 Front A: drop spurious nested assignment / variable_declarator
729 // defs that would steal calls from the enclosing function.
730 if is_spurious_nested_binding_def(node.kind(), node) {
731 continue;
732 }
733 let scope = crate::chunk::build_scope_chain(node, source);
734 let signature = crate::chunk::extract_signature(node, source);
735 #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
736 let start_line = node.start_position().row as u32 + 1;
737 #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
738 let end_line = node.end_position().row as u32 + 1;
739 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
740 let start_byte = node.start_byte() as u32;
741 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
742 let end_byte = node.end_byte() as u32;
743 // C1 (4.1.1): For Python `decorated_definition` nodes, extract the
744 // decorator name and compute the LSP kind at AST-parse time so the
745 // projection site in `render_json_budgeted` does not need to re-parse.
746 let (decorator, lsp_kind_hint) = if node.kind() == "decorated_definition" {
747 let dec = extract_first_decorator_name(&node, source.as_bytes());
748 let kind_hint = languages::lsp_symbol_kind_for_decorated_definition(
749 dec.as_deref().unwrap_or(""),
750 );
751 (dec, Some(kind_hint))
752 } else {
753 (None, None)
754 };
755 defs.push(Definition {
756 name,
757 kind: node.kind().to_string(),
758 start_line,
759 end_line,
760 scope,
761 signature,
762 start_byte,
763 end_byte,
764 calls: vec![],
765 decorator,
766 lsp_kind_hint,
767 });
768 }
769 }
770
771 defs
772}
773
774// ── Call Extraction & Resolution ────────────────────────────────────
775
/// Tiebreak priority used when two defs cover the same byte span.
///
/// Function-like kinds get `0` (so they win under `min_by_key`); everything
/// else — class body blocks, impl items and other structural containers —
/// gets `1`.
///
/// Motivating case: in Python a class body `block` and the first
/// `function_definition` inside it can span identical byte ranges. Calls in
/// the function body must be attributed to the function, not the class block.
fn is_callable_def_priority(kind: &str) -> u8 {
    let function_like = matches!(
        kind,
        "function_item"
            | "function_definition"
            | "function_declaration"
            | "function_signature_item"
            | "method_definition"
            | "method_declaration"
            | "method"
    );
    // `false` → 0 (preferred), `true` → 1.
    u8::from(!function_like)
}
799
800// ── I#77: Python import aliases ─────────────────────────────────────────
801//
802// `from X import Y as Z` and `import X as Y` rebind the canonical name to a
803// local identifier. Without alias-rewriting, the call-edge extractor records
804// `Z()` and `Y.fn()` under the alias — names that have no entry in the
805// global `def_index` — and the canonical target's `def_callers` stays empty.
806// Same NC11 closure-attribution failure class as I#57 (Rust closures) and
807// I#71 (JS closures).
808//
809// Aliases are scope-limited: a `from X import Y as Z` inside `def use()`
810// must not bind `Z` for sibling functions. The extractor records each alias
811// with the byte range of its enclosing scope (module = whole file, or the
812// `function_definition` body). The smallest enclosing alias wins at the
813// call site (same shadowing rule as Python's runtime).
814
/// A single Python import alias binding, produced by either
/// `from X import Y as Z` or `import X as Y`.
///
/// `local` is the identifier that shows up at call sites. Alias resolution
/// replaces it with the canonical target described by `canonical_module`
/// and, for `from`-imports, `canonical_name`.
#[derive(Clone, Debug)]
struct PythonAlias {
    /// The identifier written after `as` — what the call site actually uses.
    local: String,
    /// The dotted module path exactly as written in the import statement.
    canonical_module: String,
    /// The imported attribute within the module (`Y` in
    /// `from X import Y as Z`).
    ///
    /// `None` for `import X as Y`: there the alias stands for the module
    /// itself, so `Y.fn()` is resolved as `canonical_module::fn` through the
    /// receiver rather than by substituting an attribute name.
    canonical_name: Option<String>,
    /// Start byte of the enclosing scope (inclusive).
    scope_start: u32,
    /// End byte of the enclosing scope (exclusive, i.e. one past the scope's
    /// final byte).
    scope_end: u32,
}
837
838/// Walk a Python AST and collect every `import X as Y` and
839/// `from X import Y as Z` alias, recording each with the byte range of its
840/// enclosing scope (whole file at module level, `function_definition` body
841/// for function-local imports).
842///
843/// The returned list is sorted by `(scope_end - scope_start)` ascending —
844/// the smallest enclosing scope at any byte offset is the first match for a
845/// given `local` name. (Python's import shadowing rule: an inner-scope
846/// `import X as Y` shadows any outer binding of `Y`.)
847fn extract_python_aliases(source: &str, root: tree_sitter::Node<'_>) -> Vec<PythonAlias> {
848 let mut out: Vec<PythonAlias> = Vec::new();
849 collect_python_aliases_rec(source, root, &mut out);
850 // Smallest scope first so the per-call linear search finds the
851 // narrowest binding before a wider one with the same `local` name.
852 out.sort_by_key(|a| a.scope_end.saturating_sub(a.scope_start));
853 out
854}
855
856/// Recursive helper for [`extract_python_aliases`]. Visits every node and
857/// records aliases on `import_statement` / `import_from_statement`.
858fn collect_python_aliases_rec(
859 source: &str,
860 node: tree_sitter::Node<'_>,
861 out: &mut Vec<PythonAlias>,
862) {
863 match node.kind() {
864 "import_statement" => collect_aliases_import_stmt(source, node, out),
865 "import_from_statement" => collect_aliases_import_from_stmt(source, node, out),
866 _ => {}
867 }
868 let mut cursor = node.walk();
869 for child in node.children(&mut cursor) {
870 collect_python_aliases_rec(source, child, out);
871 }
872}
873
874/// Extract aliases from an `import_statement` node — patterns like
875/// `import X`, `import X.Y`, `import X as Y`, `import X.Y as Z`.
876///
877/// Only `aliased_import` children produce an alias; bare imports are ignored
878/// because the call site already uses the canonical name.
879fn collect_aliases_import_stmt(
880 source: &str,
881 node: tree_sitter::Node<'_>,
882 out: &mut Vec<PythonAlias>,
883) {
884 let scope = enclosing_python_scope(node);
885 let mut cursor = node.walk();
886 for child in node.children(&mut cursor) {
887 if child.kind() != "aliased_import" {
888 continue;
889 }
890 let (Some(name_node), Some(alias_node)) = (
891 child.child_by_field_name("name"),
892 child.child_by_field_name("alias"),
893 ) else {
894 continue;
895 };
896 let canonical_module = source[name_node.start_byte()..name_node.end_byte()].to_string();
897 let local = source[alias_node.start_byte()..alias_node.end_byte()].to_string();
898 out.push(PythonAlias {
899 local,
900 canonical_module,
901 canonical_name: None,
902 scope_start: scope.0,
903 scope_end: scope.1,
904 });
905 }
906}
907
908/// Extract aliases from an `import_from_statement` node — patterns like
909/// `from X import Y`, `from X import Y as Z`, `from X import (Y as Z, W)`.
910///
911/// Only `aliased_import` children produce an alias; bare `from X import Y`
912/// is handled by the existing imported-file resolver (Priority 4) since `Y`
913/// already matches the canonical def name in `X`.
914fn collect_aliases_import_from_stmt(
915 source: &str,
916 node: tree_sitter::Node<'_>,
917 out: &mut Vec<PythonAlias>,
918) {
919 let Some(module_node) = node.child_by_field_name("module_name") else {
920 return;
921 };
922 let canonical_module = source[module_node.start_byte()..module_node.end_byte()].to_string();
923 let scope = enclosing_python_scope(node);
924 let mut cursor = node.walk();
925 for child in node.children(&mut cursor) {
926 if child.kind() != "aliased_import" {
927 continue;
928 }
929 let (Some(name_node), Some(alias_node)) = (
930 child.child_by_field_name("name"),
931 child.child_by_field_name("alias"),
932 ) else {
933 continue;
934 };
935 let canonical_name = source[name_node.start_byte()..name_node.end_byte()].to_string();
936 let local = source[alias_node.start_byte()..alias_node.end_byte()].to_string();
937 out.push(PythonAlias {
938 local,
939 canonical_module: canonical_module.clone(),
940 canonical_name: Some(canonical_name),
941 scope_start: scope.0,
942 scope_end: scope.1,
943 });
944 }
945}
946
947/// Compute the byte range of the smallest enclosing Python scope for an
948/// import statement.
949///
950/// Walks up from `node` until we find a `function_definition` (function-local
951/// import) or run out of ancestors (module-level import, scope = whole file).
952/// Class-level imports are treated as module-level for this purpose: Python
953/// class bodies do not introduce a true lexical scope for nested function
954/// lookups, so the simplest correct rule is "function-local or wider".
955fn enclosing_python_scope(node: tree_sitter::Node<'_>) -> (u32, u32) {
956 let mut cur = node.parent();
957 while let Some(parent) = cur {
958 if parent.kind() == "function_definition" {
959 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
960 return (parent.start_byte() as u32, parent.end_byte() as u32);
961 }
962 cur = parent.parent();
963 }
964 // Reached the module root — alias is module-level. Use the root node's
965 // range (effectively the whole file).
966 let mut root = node;
967 while let Some(parent) = root.parent() {
968 root = parent;
969 }
970 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
971 {
972 (root.start_byte() as u32, root.end_byte() as u32)
973 }
974}
975
976/// Find the smallest enclosing alias for a given byte offset and local name.
977///
978/// Returns the first alias (in sorted-by-scope-size order) whose `local`
979/// equals `local_name` and whose scope contains `byte_offset`. Smallest-
980/// scope-first ordering implements Python's shadowing rule: a function-local
981/// `import X as Y` shadows a module-level `import Z as Y`.
982fn lookup_python_alias<'a>(
983 aliases: &'a [PythonAlias],
984 local_name: &str,
985 byte_offset: u32,
986) -> Option<&'a PythonAlias> {
987 aliases.iter().find(|a| {
988 a.local == local_name && a.scope_start <= byte_offset && byte_offset < a.scope_end
989 })
990}
991
992/// Rewrite a Python call's `name` and `qualified_path` through the alias map
993/// in place. Used inside [`extract_calls`] so the resolver sees the canonical
994/// target identifier and qualifier prefix instead of the local alias.
995///
996/// Two patterns are recognised, mirroring the two `aliased_import` shapes:
997///
998/// 1. **`from X import Y as Z; Z(...)`** — `name` is the bare identifier
999/// matching alias `local = Z`. Rewrite `name = Y`, set
1000/// `qualified_path = Some("X::Y")` so Priority 1 (qualified-path)
1001/// fires on the resolver.
1002/// 2. **`import X as Y; Y.fn(...)`** — callee node sits inside an
1003/// `(attribute attribute: (identifier) @callee)` shape. The
1004/// sibling `object:` child is the receiver. If the receiver is a bare
1005/// identifier matching alias `local = Y` (with `canonical_name = None`,
1006/// i.e. an `import`-style rather than `from`-style alias), keep `name`
1007/// as the attribute and set `qualified_path = Some("X::fn")`.
1008///
1009/// A function-local alias shadows a module-level alias of the same name
1010/// because [`extract_python_aliases`] sorts smallest-scope-first.
1011fn rewrite_python_call_via_alias(
1012 source: &str,
1013 aliases: &[PythonAlias],
1014 callee_node: tree_sitter::Node<'_>,
1015 call_byte: u32,
1016 name: &mut String,
1017 qualified_path: &mut Option<String>,
1018) {
1019 // Pattern 1: bare call. The callee node's parent is the `call`
1020 // itself (`(call function: (identifier) @callee)`). If the parent's
1021 // `function:` field is the identifier (not an `attribute`), this is
1022 // a bare call.
1023 let is_bare_call = callee_node
1024 .parent()
1025 .filter(|p| p.kind() == "call")
1026 .and_then(|p| p.child_by_field_name("function"))
1027 .is_some_and(|fn_node| fn_node.id() == callee_node.id());
1028
1029 if is_bare_call && let Some(alias) = lookup_python_alias(aliases, name.as_str(), call_byte) {
1030 // `from X import Y as Z`: rewrite to canonical `Y` with qualifier `X::Y`.
1031 // `import X as Z` called as `Z(...)`: less common but legal; the
1032 // canonical call is `X(...)` and the only sensible qualifier is `X`.
1033 let canonical = alias
1034 .canonical_name
1035 .clone()
1036 .unwrap_or_else(|| alias.canonical_module.clone());
1037 let qpath = match &alias.canonical_name {
1038 Some(cn) => format!("{}::{cn}", alias.canonical_module),
1039 None => alias.canonical_module.clone(),
1040 };
1041 *name = canonical;
1042 *qualified_path = Some(qpath);
1043 return;
1044 }
1045
1046 // Pattern 2: attribute call `receiver.fn()`. The callee node's
1047 // grandparent (`(call function: (attribute attribute: (identifier) @callee)) @call`)
1048 // gives us the attribute node, whose `object` field is the receiver.
1049 //
1050 // We only rewrite when the receiver is a bare identifier matching an
1051 // `import X as Y`-style alias (canonical_name = None). For a
1052 // `from X import Mod as M`-style alias on a receiver, the canonical
1053 // call would be `Mod.fn()` — but `Mod` is an attribute of `X`, not a
1054 // module path we can resolve, so we leave it alone.
1055 let Some(attr_node) = callee_node.parent().filter(|p| p.kind() == "attribute") else {
1056 return;
1057 };
1058 let Some(object_node) = attr_node.child_by_field_name("object") else {
1059 return;
1060 };
1061 if object_node.kind() != "identifier" {
1062 // `a.b.c()` — receiver is itself an attribute. Not a single-alias
1063 // rewrite; leave as-is for the resolver to handle.
1064 return;
1065 }
1066 let receiver = &source[object_node.start_byte()..object_node.end_byte()];
1067 let Some(alias) = lookup_python_alias(aliases, receiver, call_byte) else {
1068 return;
1069 };
1070 if alias.canonical_name.is_some() {
1071 // The alias rebinds a name, not a module — receiver is `M` from
1072 // `from X import Mod as M`. Canonical `Mod.fn()` isn't a path we
1073 // can construct without knowing whether `Mod` is a class or
1074 // submodule. Leave unresolved.
1075 return;
1076 }
1077 // `import X as receiver`: rewrite to qualifier `X::name`. The bare
1078 // `name` (the attribute identifier) is preserved — it is already the
1079 // function name in module `X`.
1080 *qualified_path = Some(format!("{}::{name}", alias.canonical_module));
1081}
1082
/// Extract call sites from a source file and assign them to definitions.
///
/// Parses `source` with the grammar in `call_config`, runs the language's
/// call query, and pushes one `CallRef` per `@callee` capture onto the
/// definition in `defs` whose byte range most tightly contains the call.
/// Calls outside any definition body (module-level) are ignored, as are
/// calls whose bare name equals the enclosing definition's own name.
/// If the grammar cannot be loaded or the source fails to parse, the
/// function returns without touching `defs`.
///
/// For Rust scoped calls (`a::b::foo()`), the `@callee` capture returns the
/// full `scoped_identifier` node. This function splits it into:
/// - `name` = bare trailing identifier (`"foo"`)
/// - `qualified_path` = `Some("a::b::foo")` for disambiguation in `resolve_calls`.
///
/// For method calls (`x.method()`), `receiver_type` is looked up in the map
/// built by [`infer_receiver_types`], keyed by the callee's start byte.
///
/// Language-specific extra passes (HCL, C/C++ before the query loop; JS/TS
/// after it) add edges the generic query cannot express.
fn extract_calls(source: &str, call_config: &languages::CallConfig, defs: &mut [Definition]) {
    let mut parser = Parser::new();
    if parser.set_language(&call_config.language).is_err() {
        return;
    }
    let Some(tree) = parser.parse(source, None) else {
        return;
    };

    // Build receiver-type map: byte_offset_of_call → receiver_type_string.
    // Done once per file to amortise the tree walk cost.
    let receiver_map = infer_receiver_types(source, &tree, &call_config.language);

    // HCL: run the HCL-specific call-edge extractor as an extra pass so the
    // terraform_remote_state references and module blocks contribute edges
    // that the generic function_call query cannot capture (R2 + R3, Wave 3).
    // Note that it runs before the generic query loop below.
    if languages::is_hcl_language(&call_config.language) {
        extract_hcl_call_edges(source, tree.root_node(), defs);
    }

    // C / C++ (I#55, 4.1.5): emit synthetic call-graph edges from
    // struct-literal initializer fnptrs. C codebases dispatch via tables of
    // function pointers (Linux `file_operations`, redis `redisCommandTable`,
    // libuv handle vtables) — the generic call-expression query cannot
    // capture these because there is no syntactic `f()` call site; the
    // function is referenced by bare identifier inside `{ ... }`. Without
    // this extra pass, every implementation referenced exclusively via such
    // a table appears dead (Part XI §XI.4: kernel mega-cluster collapse on
    // Linux, command-implementation collapse on redis).
    if languages::is_c_language(&call_config.language)
        || languages::is_cpp_language(&call_config.language)
    {
        extract_c_struct_init_edges(source, tree.root_node(), defs);
    }

    // I#77: Python import-alias map. For Python source, walk the AST once
    // and collect every `import X as Y` / `from X import Y as Z` binding,
    // tagged with its enclosing scope's byte range. The per-call lookup
    // below rewrites `Y(...)` and `Y.fn(...)` to their canonical targets
    // before the resolver runs. Empty for every other language.
    let python_aliases: Vec<PythonAlias> = if languages::is_python_language(&call_config.language) {
        extract_python_aliases(source, tree.root_node())
    } else {
        Vec::new()
    };

    let mut cursor = QueryCursor::new();
    let mut matches = cursor.matches(&call_config.query, tree.root_node(), source.as_bytes());

    while let Some(m) = matches.next() {
        // Per-match state. If a match carries several `@callee` captures,
        // the last one wins.
        let mut full_callee_text = None;
        let mut call_byte = 0u32;
        let mut callee_node: Option<tree_sitter::Node<'_>> = None;

        for cap in m.captures {
            let cap_name = &call_config.query.capture_names()[cap.index as usize];
            if *cap_name == "callee" {
                full_callee_text =
                    Some(source[cap.node.start_byte()..cap.node.end_byte()].to_string());
                #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
                {
                    // The call's position is the start byte of the callee
                    // node, not of the whole call expression.
                    call_byte = cap.node.start_byte() as u32;
                }
                callee_node = Some(cap.node);
            }
        }

        if let Some(full_text) = full_callee_text {
            // Split qualified path into bare name + optional qualifier.
            let (mut name, mut qualified_path) = if full_text.contains("::") {
                let bare = full_text
                    .rsplit("::")
                    .next()
                    .unwrap_or(&full_text)
                    .to_string();
                (bare, Some(full_text))
            } else {
                (full_text, None)
            };

            // I#77: Python alias rewriting. Two cases:
            //
            //   1. Bare call `baz()` where `from X import Y as baz`:
            //      rewrite `name` to canonical `Y` and set
            //      `qualified_path = Some("X::Y")` so Priority 1 fires.
            //
            //   2. Attribute call `m2.fn()` where `import X as m2`:
            //      the callee text is `fn`, but the receiver `m2` is an
            //      alias for module `X`. Walk to the `attribute object`
            //      node to extract the receiver identifier, look it up,
            //      and set `qualified_path = Some("X::fn")` so Priority 1
            //      fires.
            if !python_aliases.is_empty()
                && let Some(callee) = callee_node
            {
                rewrite_python_call_via_alias(
                    source,
                    &python_aliases,
                    callee,
                    call_byte,
                    &mut name,
                    &mut qualified_path,
                );
            }

            // Look up receiver type from the pre-built map.
            let receiver_type = receiver_map.get(&call_byte).cloned();

            // Assign to the most-specific (smallest byte range) enclosing definition.
            // Using `find` (first match) was incorrect for nested defs: an `impl_item`
            // wrapping a `function_item` both contain the call site, but the
            // `function_item` is the correct granularity for method attribution.
            //
            // Tiebreak: when two defs have equal byte spans (as happens in Python where
            // the class body `block` and its first `function_definition` share the same
            // start/end bytes), prefer function-like defs over structural container defs.
            // `is_callable_def_priority` returns 0 for function-like kinds, so they
            // compare lowest in the `min_by_key` tuple key.
            let enclosing_idx = defs
                .iter()
                .enumerate()
                .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
                .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
                .map(|(i, _)| i);

            if let Some(idx) = enclosing_idx {
                // Skip self-recursive calls (compare bare name to def name).
                // For Python this comparison uses the alias-rewritten name.
                if defs[idx].name != name {
                    defs[idx].calls.push(CallRef {
                        name,
                        qualified_path,
                        receiver_type,
                        byte_offset: call_byte,
                        resolved: None,
                    });
                }
            }
            // Calls outside any definition are ignored (module-level init).
        }
    }

    // JS / TS / TSX (B-0005): attribute closure-argument call edges to the
    // nearest named enclosing function. Arrow functions and function
    // expressions passed as arguments (e.g. `useCallback(() => fn(), [deps])`
    // or `setTimeout(() => fn(), 0)`) are "passthrough" scopes — their inner
    // calls bubble up to the nearest `function_declaration` or
    // `method_definition` ancestor. Named const-assigned arrows (`const f =
    // () => ...`) are NOT passthrough; they are their own def targets.
    //
    // This runs after the main loop so that the post-pass's dedup check
    // against existing (byte_offset, name) pairs can prevent duplicates from
    // being emitted when a call is already attributed to the correct named
    // function.
    if is_js_or_ts_language(&call_config.language) {
        extract_js_closure_call_edges(source, tree.root_node(), defs);
    }
}
1252
1253// ── JS / TS closure call-edge attribution ────────────────────────────────────
1254
1255/// Returns `true` if `lang` is one of the JavaScript / TypeScript grammars.
1256///
1257/// Uses the same node-kind-count proxy as [`languages::is_rust_language`].
1258/// JS, TS, and TSX all need closure attribution (B-0005).
1259fn is_js_or_ts_language(lang: &tree_sitter::Language) -> bool {
1260 let js_lang: tree_sitter::Language = tree_sitter_javascript::LANGUAGE.into();
1261 let ts_lang: tree_sitter::Language = tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into();
1262 let tsx_lang: tree_sitter::Language = tree_sitter_typescript::LANGUAGE_TSX.into();
1263 let matches = |reference: tree_sitter::Language| {
1264 lang.abi_version() == reference.abi_version()
1265 && lang.node_kind_count() == reference.node_kind_count()
1266 };
1267 matches(js_lang) || matches(ts_lang) || matches(tsx_lang)
1268}
1269
1270/// Returns `true` if `node` is an `arrow_function` or `function_expression`
1271/// that is passed directly as an argument to another call expression.
1272///
1273/// The check is: the node's immediate parent is an `arguments` node.
1274/// This distinguishes "argument closures" (passthrough for call attribution)
1275/// from "named const arrows" (`const f = () => ...`) whose parent is a
1276/// `variable_declarator`.
1277fn is_argument_closure(node: tree_sitter::Node<'_>) -> bool {
1278 matches!(node.kind(), "arrow_function" | "function_expression")
1279 && node.parent().is_some_and(|p| p.kind() == "arguments")
1280}
1281
1282/// Walk up the tree from `node`, passing through all transparent nodes
1283/// (`arrow_function`, `function_expression`, `statement_block`, `arguments`,
1284/// `call_expression`, `variable_declarator`, `lexical_declaration`,
1285/// `expression_statement`, etc.) until we reach a `function_declaration` or
1286/// `method_definition`, which is the nearest named enclosing function.
1287///
1288/// Returns `None` if no named function is found above `node` (e.g. the
1289/// closure lives at module/file scope).
1290fn nearest_named_fn_ancestor(node: tree_sitter::Node<'_>) -> Option<tree_sitter::Node<'_>> {
1291 let mut current = node.parent()?;
1292 loop {
1293 match current.kind() {
1294 // Named function boundaries: stop here.
1295 "function_declaration" | "method_definition" => return Some(current),
1296 // All other node kinds are transparent: keep climbing.
1297 //
1298 // The explicit list of transparent kinds (closures, declarations,
1299 // control-flow, etc.) is documented here for readability, but the
1300 // wildcard covers any new node kinds added in future grammar
1301 // versions — the walk is intentionally permissive so that
1302 // attribution never silently breaks on grammar updates.
1303 //
1304 // Transparent: "arrow_function", "function_expression",
1305 // "statement_block", "arguments", "call_expression",
1306 // "member_expression", "variable_declarator", "lexical_declaration",
1307 // "expression_statement", "return_statement", "await_expression",
1308 // "class_body", "class_declaration", "export_statement",
1309 // "if_statement", "while_statement", "for_statement", etc.
1310 _ => {}
1311 }
1312 match current.parent() {
1313 Some(p) => current = p,
1314 None => return None,
1315 }
1316 }
1317}
1318
1319/// Collect all `call_expression` callee names that appear directly inside
1320/// `closure` (an `arrow_function` or `function_expression`), visiting only
1321/// one level of closure depth (recursive closures are handled by the outer
1322/// DFS in [`extract_js_closure_call_edges`]).
1323///
1324/// Returns a `Vec` of `(callee_name, byte_offset)` pairs matching the callee
1325/// capture of the JS call query: either a bare `identifier` or the
1326/// `property_identifier` from a `member_expression`.
1327fn collect_calls_in_closure<'a>(
1328 source: &'a str,
1329 closure: tree_sitter::Node<'a>,
1330) -> Vec<(String, u32)> {
1331 let mut results = Vec::new();
1332 let mut stack: Vec<tree_sitter::Node<'_>> = Vec::new();
1333 // Start from the closure body, not the closure node itself.
1334 let body = closure.child_by_field_name("body").unwrap_or(closure);
1335 let mut cursor = body.walk();
1336 for child in body.children(&mut cursor) {
1337 stack.push(child);
1338 }
1339
1340 while let Some(node) = stack.pop() {
1341 if node.kind() == "call_expression" {
1342 // Extract callee: either `function: identifier` or
1343 // `function: member_expression property: property_identifier`.
1344 if let Some(fn_node) = node.child_by_field_name("function") {
1345 let callee_opt: Option<(String, u32)> = match fn_node.kind() {
1346 "identifier" => {
1347 let name = source[fn_node.start_byte()..fn_node.end_byte()].to_string();
1348 #[expect(
1349 clippy::cast_possible_truncation,
1350 reason = "byte offsets fit in u32"
1351 )]
1352 let byte = fn_node.start_byte() as u32;
1353 Some((name, byte))
1354 }
1355 "member_expression" => fn_node.child_by_field_name("property").map(|prop| {
1356 let name = source[prop.start_byte()..prop.end_byte()].to_string();
1357 #[expect(
1358 clippy::cast_possible_truncation,
1359 reason = "byte offsets fit in u32"
1360 )]
1361 let byte = prop.start_byte() as u32;
1362 (name, byte)
1363 }),
1364 _ => None,
1365 };
1366 if let Some(pair) = callee_opt {
1367 results.push(pair);
1368 }
1369 }
1370 }
1371 // Recurse into children, but do NOT descend into nested
1372 // arrow_function / function_expression nodes — those are either
1373 // handled by the outer DFS (if they are argument closures) or are
1374 // named const-arrows (which are their own defs and handled by the
1375 // main loop).
1376 if !matches!(node.kind(), "arrow_function" | "function_expression") {
1377 let mut c = node.walk();
1378 for child in node.children(&mut c) {
1379 stack.push(child);
1380 }
1381 }
1382 }
1383 results
1384}
1385
/// Post-pass for JS / TS / TSX files.
///
/// Two sub-passes:
///
/// **Pass 1 — closure-argument attribution**: For every `arrow_function` or
/// `function_expression` that is passed as an argument to a call (i.e., its
/// parent is an `arguments` node), attributes the inner calls to the nearest
/// named enclosing `function_declaration` or `method_definition`.
///
/// **Pass 2 — variable-declarator propagation**: For every `variable_declarator`
/// def that sits inside a named function, propagates all its calls to the
/// enclosing named function. This captures the outer call itself (e.g.
/// `useCallback(...)` in `const handler = useCallback(...)`) which the main
/// loop attributed to `handler` (the smallest enclosing def) but which also
/// belongs to the enclosing React component.
///
/// Both passes use `(byte_offset, name)` deduplication so calls are never
/// duplicated on a single def, and both skip calls whose name equals the
/// target def's own name. Edges added here carry only the name and byte
/// offset: `qualified_path` and `receiver_type` are always `None`.
///
/// This fixes B-0005: React hooks (`useCallback`, `useMemo`, `useEffect`),
/// timer APIs (`setTimeout`, `setInterval`), array methods
/// (`.map`, `.filter`, `.forEach`, `.reduce`), and Express middleware
/// (`app.use`, `app.get`) all pass closures as arguments. Without this
/// post-pass, every function called INSIDE such a closure appears dead
/// because the call edge is attributed to the anonymous closure (which has
/// no def of its own) rather than to the enclosing named function.
///
/// **Rule** (from DESIGN §B-0005): anonymous closures and arrows are
/// "passthrough". Walk up to the nearest `function_declaration` /
/// `method_definition`. If none is found (closure at module scope), no edge
/// is emitted — matching the existing behaviour for top-level JS calls.
fn extract_js_closure_call_edges(
    source: &str,
    root: tree_sitter::Node<'_>,
    defs: &mut [Definition],
) {
    // Pass 1: walk tree — attribute calls inside argument-closure bodies to
    // the nearest named enclosing function.
    {
        let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
        while let Some(node) = stack.pop() {
            if is_argument_closure(node) {
                // Find the nearest named function ancestor.
                if let Some(named_fn) = nearest_named_fn_ancestor(node) {
                    // Identify which def this named function corresponds to by
                    // matching byte range and kind.
                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
                    let fn_start = named_fn.start_byte() as u32;
                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
                    let fn_end = named_fn.end_byte() as u32;

                    // An exact span match is required; if the named function
                    // has no def with the same start/end bytes, the closure's
                    // calls are not attributed by this pass.
                    let def_idx = defs.iter().position(|d| {
                        d.start_byte == fn_start
                            && d.end_byte == fn_end
                            && matches!(
                                d.kind.as_str(),
                                "function_declaration" | "method_definition"
                            )
                    });

                    if let Some(idx) = def_idx {
                        // Collect calls inside this closure (one level deep;
                        // nested closures are handled by their own DFS iteration).
                        let inner_calls = collect_calls_in_closure(source, node);
                        for (name, byte_offset) in inner_calls {
                            // Skip self-recursive calls.
                            if defs[idx].name == name {
                                continue;
                            }
                            // Skip duplicate edges already present in the def.
                            if defs[idx]
                                .calls
                                .iter()
                                .any(|c| c.byte_offset == byte_offset && c.name == name)
                            {
                                continue;
                            }
                            defs[idx].calls.push(CallRef {
                                name,
                                qualified_path: None,
                                receiver_type: None,
                                byte_offset,
                                resolved: None,
                            });
                        }
                    }
                }
            }
            // Recurse into all children (including the children of argument
            // closures, so nested argument closures get their own visit).
            let mut cursor = node.walk();
            for child in node.children(&mut cursor) {
                stack.push(child);
            }
        }
    }

    // Pass 2: propagate calls from variable_declarator defs to their enclosing
    // named function defs.
    //
    // When a JS pattern like `const handler = useCallback(...)` creates a
    // `variable_declarator` def named `handler`, the main loop attributes
    // calls like `useCallback(...)` to `handler` (the smallest enclosing def).
    // Those calls also semantically belong to the enclosing named function
    // (e.g. `Component`) because they execute in its runtime context.
    //
    // We collect (variable_declarator_idx, enclosing_named_fn_idx) pairs first
    // to avoid mutating `defs` while iterating over it.
    let propagation_pairs: Vec<(usize, usize)> = defs
        .iter()
        .enumerate()
        .filter(|(_, d)| d.kind == "variable_declarator")
        .filter_map(|(var_idx, var_def)| {
            // Find the smallest named function def whose byte range contains
            // this variable_declarator. The bounds check is inclusive on both
            // ends (an identical span would also qualify); the def is only
            // prevented from matching itself via the index comparison.
            let enclosing = defs
                .iter()
                .enumerate()
                .filter(|(enc_idx, enc)| {
                    *enc_idx != var_idx
                        && matches!(
                            enc.kind.as_str(),
                            "function_declaration" | "method_definition"
                        )
                        && enc.start_byte <= var_def.start_byte
                        && var_def.end_byte <= enc.end_byte
                })
                .min_by_key(|(_, enc)| enc.end_byte - enc.start_byte)
                .map(|(i, _)| i);
            enclosing.map(|enc_idx| (var_idx, enc_idx))
        })
        .collect();

    for (var_idx, enc_idx) in propagation_pairs {
        // Clone the calls to propagate (avoiding borrow-checker conflict).
        // Only the name and byte offset are carried over.
        let calls_to_propagate: Vec<(String, u32)> = defs[var_idx]
            .calls
            .iter()
            .map(|c| (c.name.clone(), c.byte_offset))
            .collect();
        for (name, byte_offset) in calls_to_propagate {
            // Skip self-recursive edges.
            if defs[enc_idx].name == name {
                continue;
            }
            // Skip duplicates.
            if defs[enc_idx]
                .calls
                .iter()
                .any(|c| c.byte_offset == byte_offset && c.name == name)
            {
                continue;
            }
            defs[enc_idx].calls.push(CallRef {
                name,
                qualified_path: None,
                receiver_type: None,
                byte_offset,
                resolved: None,
            });
        }
    }
}
1548
1549// HCL: post-pass call-edge extraction for terraform_remote_state and module
1550// blocks. These are not function calls — they are HCL-specific structural
1551// references to other Terraform modules — so the generic
1552// `(function_call (identifier) @callee) @call` pattern in
1553// `call_query_for_extension("tf")` cannot capture them. This helper runs
1554// once per HCL file inside `extract_calls` (R2 + R3, Wave 3).
1555
1556/// Walk an HCL parse tree and emit CallRef entries for:
1557///
1558/// 1. `data.terraform_remote_state.<NAME>.outputs.<ATTR>` expressions:
1559/// one CallRef per reference, with `name = NAME` and
1560/// `qualified_path = Some("terraform_remote_state.NAME")`. These
1561/// connect the current file to the named remote-state module's outputs.
1562///
1563/// 2. `module "X" { source = "../X" }` blocks: one CallRef with
1564/// `name = X` (the label) and `qualified_path = Some("module.X")`.
1565/// The module reference connects to the module's directory in
1566/// `resolve_import` (HCL module-source resolution is not implemented
1567/// yet — the qualified_path carrier is the contract; resolve adds the
1568/// file lookup).
1569///
1570/// Each emitted CallRef is attached to the smallest enclosing definition
1571/// by byte range — matching the same heuristic used by `extract_calls`.
1572fn extract_hcl_call_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1573 // Walk all named descendants iteratively.
1574 let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1575 while let Some(node) = stack.pop() {
1576 // Defer to a function-style helper per node kind.
1577 match node.kind() {
1578 "expression" => hcl_visit_expression(source, node, defs),
1579 "block" => hcl_visit_block(source, node, defs),
1580 _ => {}
1581 }
1582 // Recurse into named children.
1583 let mut cursor = node.walk();
1584 for child in node.children(&mut cursor) {
1585 if child.is_named() {
1586 stack.push(child);
1587 }
1588 }
1589 }
1590}
1591
1592/// Inspect an HCL `expression` node for the
1593/// `data.terraform_remote_state.<NAME>.outputs.<ATTR>` reference pattern.
1594///
1595/// The expression tree looks like:
1596/// ```text
1597/// expression
1598/// variable_expr
1599/// identifier "data"
1600/// get_attr
1601/// identifier "terraform_remote_state"
1602/// get_attr
1603/// identifier "<NAME>"
1604/// get_attr
1605/// identifier "outputs"
1606/// get_attr
1607/// identifier "<ATTR>"
1608/// ```
1609fn hcl_visit_expression(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1610 // Collect children: must be `variable_expr` (with identifier="data")
1611 // followed by a chain of `get_attr` nodes (each with an `identifier` child).
1612 let mut cursor = node.walk();
1613 let mut child_iter = node.children(&mut cursor);
1614 let Some(first) = child_iter.next() else {
1615 return;
1616 };
1617 if first.kind() != "variable_expr" {
1618 return;
1619 }
1620 let Some(first_id) = first.child_by_field_name("name").or_else(|| {
1621 // Fallback: find first named child that's an identifier.
1622 let mut c = first.walk();
1623 first.children(&mut c).find(|n| n.kind() == "identifier")
1624 }) else {
1625 return;
1626 };
1627 if &source[first_id.start_byte()..first_id.end_byte()] != "data" {
1628 return;
1629 }
1630
1631 // Collect identifiers from the chain of get_attr.
1632 let mut chain: Vec<String> = Vec::new();
1633 for child in child_iter {
1634 if child.kind() != "get_attr" {
1635 return; // not a pure attribute chain
1636 }
1637 let mut gc = child.walk();
1638 let id = child.children(&mut gc).find(|n| n.kind() == "identifier");
1639 let Some(id_node) = id else { return };
1640 chain.push(source[id_node.start_byte()..id_node.end_byte()].to_string());
1641 }
1642
1643 // Expect: terraform_remote_state, <NAME>, outputs, <ATTR>
1644 if chain.len() < 2 || chain[0] != "terraform_remote_state" {
1645 return;
1646 }
1647 let name = chain[1].clone();
1648 let qualified_path = format!("terraform_remote_state.{name}");
1649
1650 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1651 let call_byte = node.start_byte() as u32;
1652 attach_hcl_call(defs, call_byte, name.clone(), Some(qualified_path));
1653
1654 // I#54a — additionally emit an `outputs.<ATTR>` edge when the chain
1655 // reaches `outputs.<ATTR>`. This is the dependency edge a non-trivial
1656 // IaC repo actually has: the consumer references a *specific* upstream
1657 // output by name (`outputs.bar`), and the upstream tfstate file
1658 // contains an `output "bar" { ... }` block whose def name is exactly
1659 // `bar`. Emitting this second edge — with `name = ATTR` and a
1660 // qualified path that records the full chain — lets `resolve_calls`
1661 // bind the consumer to the upstream file regardless of path layout
1662 // (Aurora-style `infrastructure/shared/main.tf` and the simpler
1663 // `infra/foo.tf` shape both work).
1664 //
1665 // The legacy label edge above is preserved so the path-segment
1666 // resolution branch in `resolve_calls_inner` keeps firing on
1667 // existing corpora.
1668 if chain.len() >= 4 && chain[2] == "outputs" {
1669 let attr = chain[3].clone();
1670 let attr_qpath = format!("terraform_remote_state.{name}.outputs.{attr}");
1671 attach_hcl_call(defs, call_byte, attr, Some(attr_qpath));
1672 }
1673}
1674
1675/// Inspect an HCL `block` node for the `module "X" { source = "../X" }`
1676/// pattern. Emits one CallRef per matching block.
1677fn hcl_visit_block(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1678 // First child must be identifier="module".
1679 let mut cursor = node.walk();
1680 let children: Vec<tree_sitter::Node<'_>> = node.children(&mut cursor).collect();
1681 let Some(first) = children.first() else {
1682 return;
1683 };
1684 if first.kind() != "identifier" || &source[first.start_byte()..first.end_byte()] != "module" {
1685 return;
1686 }
1687 // Next child should be a string_lit (the module label).
1688 let label_node = children.iter().find(|c| c.kind() == "string_lit");
1689 let Some(label_node) = label_node else {
1690 return;
1691 };
1692 let mut lc = label_node.walk();
1693 let template = label_node
1694 .children(&mut lc)
1695 .find(|n| n.kind() == "template_literal");
1696 let Some(template) = template else {
1697 return;
1698 };
1699 let label = source[template.start_byte()..template.end_byte()].to_string();
1700 let qualified_path = format!("module.{label}");
1701
1702 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1703 let call_byte = node.start_byte() as u32;
1704 attach_hcl_call(defs, call_byte, label, Some(qualified_path));
1705}
1706
1707/// Attach a synthesized HCL CallRef to the smallest enclosing definition.
1708/// Mirrors the byte-range attribution from `extract_calls`.
1709fn attach_hcl_call(
1710 defs: &mut [Definition],
1711 call_byte: u32,
1712 name: String,
1713 qualified_path: Option<String>,
1714) {
1715 let enclosing_idx = defs
1716 .iter()
1717 .enumerate()
1718 .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
1719 .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
1720 .map(|(i, _)| i);
1721 if let Some(idx) = enclosing_idx {
1722 // Skip self-recursive emission (would happen if the enclosing def
1723 // happens to share the same `name` as the synthesized callee).
1724 if defs[idx].name != name {
1725 defs[idx].calls.push(CallRef {
1726 name,
1727 qualified_path,
1728 receiver_type: None,
1729 byte_offset: call_byte,
1730 resolved: None,
1731 });
1732 }
1733 }
1734}
1735
1736/// Walk a C / C++ parse tree and emit synthetic CallRef entries for every
1737/// function identifier appearing inside a struct or array initializer
1738/// literal (I#55, 4.1.5).
1739///
1740/// Two syntactic forms produce edges:
1741///
1742/// 1. **Designated initializer** — `(initializer_pair value: (identifier))`
1743/// — corresponds to `.field = funcname` syntax (Linux `file_operations`,
1744/// ALSA `snd_pcm_ops`, etc):
1745/// ```c
1746/// static const struct file_operations my_fops = {
1747/// .read = my_read, // ← edge: my_fops → my_read
1748/// .write = my_write, // ← edge: my_fops → my_write
1749/// };
1750/// ```
1751///
1752/// 2. **Positional initializer** — `(initializer_list (identifier))` —
1753/// corresponds to bare identifier slots inside `{ ... }` (redis
1754/// `redisCommandTable`, libuv handle vtables):
1755/// ```c
1756/// struct redisCommand cmds[] = {
1757/// {"get", getCommand, 2}, // ← edge: cmds → getCommand
1758/// {"set", setCommand, -3}, // ← edge: cmds → setCommand
1759/// };
1760/// ```
1761///
1762/// Each emitted CallRef is attached to the smallest enclosing definition by
1763/// byte range (same heuristic as `extract_calls` / `attach_hcl_call`). For
1764/// the typical use case the enclosing def is the array/struct declaration
1765/// itself (e.g. `my_fops` or `cmds`). When the enclosing def is itself the
1766/// referenced function (e.g. a struct field designator inside a function
1767/// body) the self-recursive edge is suppressed.
1768///
1769/// Non-identifier initializer values (string literals, integers, nested
1770/// braces) are skipped by tree-sitter's `kind() == "identifier"` filter, so
1771/// no false-positive edges to undefined symbols are emitted. The resolver
1772/// (`resolve_calls`) then either binds the identifier to a real function
1773/// def (preserved as a real edge) or leaves `resolved = None` (dropped at
1774/// edge-construction time, matching how all unresolved CallRefs behave).
1775fn extract_c_struct_init_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1776 // Iterative DFS to avoid stack-blowup on deeply-nested initializer
1777 // tables (some kernel drivers nest 4–5 levels of struct-of-struct).
1778 let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1779 while let Some(node) = stack.pop() {
1780 match node.kind() {
1781 // Designated initializer: `.field = funcname`. The grammar
1782 // exposes the rhs as a positional child, but using the `value:`
1783 // field selector picks it unambiguously when present and falls
1784 // back to the last named child otherwise. We only consume bare
1785 // identifier values — string_literal, number_literal, and
1786 // nested initializer_list are dispatched separately (the
1787 // recursion below re-enters nested initializer_list nodes).
1788 "initializer_pair" => {
1789 let value_node = node.child_by_field_name("value").or_else(|| {
1790 // Fallback: last named child that isn't a designator.
1791 let mut c = node.walk();
1792 node.children(&mut c)
1793 .filter(|n| n.is_named() && n.kind() != "field_designator")
1794 .last()
1795 });
1796 if let Some(v) = value_node
1797 && v.kind() == "identifier"
1798 {
1799 emit_c_init_edge(source, v, defs);
1800 }
1801 }
1802 // Positional initializer: bare identifier directly inside an
1803 // `initializer_list`. We do NOT recurse from here to pick up
1804 // identifiers — the outer DFS already visits every node, and
1805 // emitting on direct identifier children of `initializer_list`
1806 // covers `{"name", funcname, 2}` patterns. Nested
1807 // `initializer_list` children (e.g. `{{...},{...}}`) are
1808 // popped onto the stack and processed in their own iteration.
1809 "initializer_list" => {
1810 let mut c = node.walk();
1811 for child in node.children(&mut c) {
1812 if child.kind() == "identifier" {
1813 emit_c_init_edge(source, child, defs);
1814 }
1815 }
1816 }
1817 _ => {}
1818 }
1819 // Recurse into named children. The DFS visits the entire subtree
1820 // so nested `initializer_pair` / `initializer_list` nodes are
1821 // reached without special handling.
1822 let mut cursor = node.walk();
1823 for child in node.children(&mut cursor) {
1824 if child.is_named() {
1825 stack.push(child);
1826 }
1827 }
1828 }
1829}
1830
1831/// Emit one synthetic CallRef edge for a C/C++ struct-literal fnptr
1832/// reference. `ident_node` must be an `identifier` node; its text becomes
1833/// the callee name. The edge is attached to the smallest enclosing
1834/// definition (typically the surrounding `declaration` def for the table
1835/// variable itself).
1836fn emit_c_init_edge(source: &str, ident_node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1837 let name = source[ident_node.start_byte()..ident_node.end_byte()].to_string();
1838 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1839 let call_byte = ident_node.start_byte() as u32;
1840
1841 let enclosing_idx = defs
1842 .iter()
1843 .enumerate()
1844 .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
1845 .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
1846 .map(|(i, _)| i);
1847
1848 let Some(idx) = enclosing_idx else {
1849 return;
1850 };
1851 // Skip self-recursive emission (would happen if the enclosing def is
1852 // itself the referenced function — e.g. a static initializer inside a
1853 // function body whose .field designates the same function).
1854 if defs[idx].name == name {
1855 return;
1856 }
1857 // Skip duplicate edges (same caller, same name, same byte offset) —
1858 // belt-and-braces against the DFS visiting an identifier through both
1859 // its parent initializer_pair and a containing initializer_list scan.
1860 if defs[idx]
1861 .calls
1862 .iter()
1863 .any(|c| c.byte_offset == call_byte && c.name == name)
1864 {
1865 return;
1866 }
1867 defs[idx].calls.push(CallRef {
1868 name,
1869 qualified_path: None,
1870 receiver_type: None,
1871 byte_offset: call_byte,
1872 resolved: None,
1873 });
1874}
1875
1876/// Infer method-call receiver types from local context within a parse tree.
1877///
1878/// Returns a map from `byte_offset_of_@callee_capture` to a receiver type string.
1879///
1880/// Dispatches to a language-specific collector:
1881///
1882/// - **Rust**: [`collect_rust_receiver_types`] — three heuristic cases:
1883/// 1. `self.method()` inside `impl Foo { … }` → `"Foo"`.
1884/// 2. `x.method()` where `x: Bar` is a function parameter → `"Bar"`.
1885/// 3. `x.method()` after `let x = Foo::new()` → `"Foo"`.
1886///
1887/// - **Python**: [`collect_python_receiver_types`] — two heuristic cases:
1888/// 1. `self.method()` inside a class method → class name from enclosing
1889/// `class_definition`.
1890/// 2. `instance.method()` where `instance: ClassName` type annotation or
1891/// `instance = ClassName(...)` assignment is visible in the same scope.
1892///
1893/// - **Go**: [`collect_go_receiver_types`] — one heuristic case:
1894/// 1. `recv.Method()` inside a `method_declaration` where `recv` is the
1895/// named receiver parameter → receiver type from the method signature.
1896///
1897/// This is heuristic, not type-inference-complete. Unknown/ambiguous cases
1898/// produce no entry in the map; `extract_calls` leaves those `receiver_type = None`.
1899fn infer_receiver_types(
1900 source: &str,
1901 tree: &tree_sitter::Tree,
1902 language: &tree_sitter::Language,
1903) -> HashMap<u32, String> {
1904 let mut map: HashMap<u32, String> = HashMap::new();
1905
1906 if languages::is_rust_language(language) {
1907 collect_rust_receiver_types(source, tree.root_node(), &mut map);
1908 } else if languages::is_python_language(language) {
1909 collect_python_receiver_types(source, tree.root_node(), &mut map);
1910 } else if languages::is_go_language(language) {
1911 collect_go_receiver_types(source, tree.root_node(), &mut map);
1912 }
1913 // Other languages: no receiver inference — leave map empty.
1914
1915 map
1916}
1917
1918/// Walk the Rust parse tree and fill `map` with receiver-type inference.
1919///
1920/// This is a recursive descent that tracks:
1921/// - The current `impl Foo` or `impl Foo for Bar` type name (for `self.*` calls).
1922/// - Parameter type annotations (for `x: SomeType` → `x` has type `SomeType`).
1923/// - Constructor let-bindings (`let x = Foo::new()` → `x` has type `Foo`).
1924fn collect_rust_receiver_types(
1925 source: &str,
1926 node: tree_sitter::Node<'_>,
1927 map: &mut HashMap<u32, String>,
1928) {
1929 // We use a stack-based walk to avoid deep recursion on large files.
1930 // Each stack entry carries (node, impl_type_context).
1931 let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(node, None)];
1932
1933 while let Some((n, impl_ctx)) = stack.pop() {
1934 match n.kind() {
1935 "impl_item" => {
1936 // Extract `impl Foo` or `impl Trait for Foo` → capture the `for` type.
1937 // tree-sitter-rust shape: `(impl_item type: (type_identifier) @type)` for
1938 // inherent impls, and `(impl_item trait: … type: (type_identifier) @type)`
1939 // for trait impls. Both have a child named `type`.
1940 let impl_type = extract_impl_self_type(source, n);
1941 let new_ctx = impl_type.or_else(|| impl_ctx.clone());
1942 let mut cursor = n.walk();
1943 for child in n.children(&mut cursor) {
1944 stack.push((child, new_ctx.clone()));
1945 }
1946 }
1947 "function_item" => {
1948 // Build parameter bindings: (param_name → type_name).
1949 let param_types = extract_param_types(source, n);
1950 // Build let-binding type map from constructor calls.
1951 let let_types = extract_let_binding_types(source, n);
1952 // Annotate call sites within this function body.
1953 annotate_method_calls(
1954 source,
1955 n,
1956 impl_ctx.as_deref(),
1957 ¶m_types,
1958 &let_types,
1959 map,
1960 );
1961 // Do NOT recurse into function_item children with the outer stack —
1962 // function bodies are fully handled by annotate_method_calls.
1963 // (Nested fn items would re-enter via their own impl_item context.)
1964 // Push children with same impl_ctx so nested impl blocks are found.
1965 let mut cursor = n.walk();
1966 for child in n.children(&mut cursor) {
1967 stack.push((child, impl_ctx.clone()));
1968 }
1969 }
1970 _ => {
1971 let mut cursor = n.walk();
1972 for child in n.children(&mut cursor) {
1973 stack.push((child, impl_ctx.clone()));
1974 }
1975 }
1976 }
1977 }
1978}
1979
1980/// Extract the self type from an `impl_item` node.
1981///
1982/// For `impl Foo { … }` → `Some("Foo")`.
1983/// For `impl Trait for Foo { … }` → `Some("Foo")` (the concrete `for` type).
1984fn extract_impl_self_type(source: &str, impl_node: tree_sitter::Node<'_>) -> Option<String> {
1985 // tree-sitter-rust: impl_item has a field named "type" for the self type.
1986 // For `impl Foo for Bar { }`, "type" is Bar; for `impl Foo { }`, "type" is Foo.
1987 let type_node = impl_node.child_by_field_name("type")?;
1988 Some(source[type_node.start_byte()..type_node.end_byte()].to_string())
1989}
1990
1991/// Extract parameter name → type mappings from a function signature.
1992///
1993/// Handles `fn foo(x: Bar, y: Baz)` → `{"x": "Bar", "y": "Baz"}`.
1994/// The `self`/`&self`/`&mut self` parameter is skipped (handled via impl_ctx).
1995fn extract_param_types(source: &str, fn_node: tree_sitter::Node<'_>) -> HashMap<String, String> {
1996 let mut params: HashMap<String, String> = HashMap::new();
1997 let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1998 return params;
1999 };
2000 let mut cursor = params_node.walk();
2001 for param in params_node.children(&mut cursor) {
2002 if param.kind() == "parameter" {
2003 // parameter has children: pattern (identifier) and type
2004 let mut param_name = None;
2005 let mut param_type = None;
2006 let mut pc = param.walk();
2007 for child in param.children(&mut pc) {
2008 match child.kind() {
2009 "identifier" | "mutable_specifier" if param_name.is_none() => {
2010 let text = source[child.start_byte()..child.end_byte()].to_string();
2011 if text != "mut" {
2012 param_name = Some(text);
2013 }
2014 }
2015 "type_identifier"
2016 | "generic_type"
2017 | "reference_type"
2018 | "scoped_type_identifier"
2019 if param_type.is_none() =>
2020 {
2021 // Extract the base type identifier from potentially complex types.
2022 param_type = Some(extract_base_type(source, child));
2023 }
2024 _ => {}
2025 }
2026 }
2027 if let (Some(name), Some(ty)) = (param_name, param_type)
2028 && !ty.is_empty()
2029 {
2030 params.insert(name, ty);
2031 }
2032 }
2033 // Also handle typed_pattern in newer grammars
2034 if param.kind() == "typed_pattern" {
2035 let mut name_part = None;
2036 let mut type_part = None;
2037 let mut pc = param.walk();
2038 for child in param.children(&mut pc) {
2039 if child.kind() == "identifier" && name_part.is_none() {
2040 name_part = Some(source[child.start_byte()..child.end_byte()].to_string());
2041 } else if matches!(
2042 child.kind(),
2043 "type_identifier"
2044 | "generic_type"
2045 | "reference_type"
2046 | "scoped_type_identifier"
2047 ) && type_part.is_none()
2048 {
2049 type_part = Some(extract_base_type(source, child));
2050 }
2051 }
2052 if let (Some(name), Some(ty)) = (name_part, type_part)
2053 && !ty.is_empty()
2054 {
2055 params.insert(name, ty);
2056 }
2057 }
2058 }
2059 params
2060}
2061
2062/// Extract the base `TypeIdentifier` from a potentially complex type node.
2063///
2064/// For `Bar`, `&Bar`, `&mut Bar`, `Bar<T>` → returns `"Bar"`.
2065/// For `module::Bar` → returns `"Bar"` (bare name for matching).
2066fn extract_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2067 match node.kind() {
2068 "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2069 "generic_type" | "reference_type" | "mutable_specifier" | "scoped_type_identifier" => {
2070 // Recurse to find the innermost type_identifier
2071 let mut cursor = node.walk();
2072 for child in node.children(&mut cursor) {
2073 let t = extract_base_type(source, child);
2074 if !t.is_empty() {
2075 return t;
2076 }
2077 }
2078 String::new()
2079 }
2080 _ => {
2081 // For other nodes, try children
2082 let mut cursor = node.walk();
2083 for child in node.children(&mut cursor) {
2084 if child.kind() == "type_identifier" {
2085 return source[child.start_byte()..child.end_byte()].to_string();
2086 }
2087 }
2088 String::new()
2089 }
2090 }
2091}
2092
2093/// Scan a function body for `let x = Foo::new()` patterns.
2094///
2095/// Returns a map from local variable name to the constructor type name.
2096/// E.g., `let x = Foo::new();` → `{"x": "Foo"}`.
2097fn extract_let_binding_types(
2098 source: &str,
2099 fn_node: tree_sitter::Node<'_>,
2100) -> HashMap<String, String> {
2101 let mut bindings: HashMap<String, String> = HashMap::new();
2102
2103 let Some(body) = fn_node.child_by_field_name("body") else {
2104 return bindings;
2105 };
2106
2107 // Walk the function body looking for let_declaration nodes.
2108 let mut stack = vec![body];
2109 while let Some(n) = stack.pop() {
2110 if n.kind() == "let_declaration" {
2111 // let_declaration: pattern (identifier) + value (call_expression or …)
2112 let mut binding_name = None;
2113 let mut constructor_type = None;
2114 let mut cursor = n.walk();
2115 for child in n.children(&mut cursor) {
2116 match child.kind() {
2117 "identifier" if binding_name.is_none() => {
2118 binding_name =
2119 Some(source[child.start_byte()..child.end_byte()].to_string());
2120 }
2121 "call_expression" => {
2122 // Look for `Foo::new()` or `Foo::from(…)` patterns.
2123 // The function child of call_expression is a scoped_identifier.
2124 if let Some(func) = child.child_by_field_name("function")
2125 && func.kind() == "scoped_identifier"
2126 {
2127 // scoped_identifier path: `Foo::new` — extract head segment.
2128 let full = source[func.start_byte()..func.end_byte()].to_string();
2129 let head = full.split("::").next().unwrap_or("").to_string();
2130 if !head.is_empty()
2131 && head.chars().next().is_some_and(char::is_uppercase)
2132 {
2133 constructor_type = Some(head);
2134 }
2135 }
2136 }
2137 _ => {}
2138 }
2139 }
2140 if let (Some(name), Some(ty)) = (binding_name, constructor_type) {
2141 bindings.insert(name, ty);
2142 }
2143 }
2144 // Push children for recursive walk.
2145 let mut cursor = n.walk();
2146 for child in n.children(&mut cursor) {
2147 stack.push(child);
2148 }
2149 }
2150
2151 bindings
2152}
2153
2154/// Walk a function body and annotate method-call byte offsets with receiver types.
2155///
2156/// A "method call" in the ripvec call query is:
2157/// `(call_expression function: (field_expression field: (field_identifier) @callee))`
2158///
2159/// The receiver is the `value` child of `field_expression`. This function
2160/// checks whether the receiver is:
2161/// - `self` → use `impl_ctx` type.
2162/// - An identifier matching a parameter type in `param_types`.
2163/// - An identifier matching a constructor let-binding in `let_types`.
2164fn annotate_method_calls(
2165 source: &str,
2166 fn_node: tree_sitter::Node<'_>,
2167 impl_ctx: Option<&str>,
2168 param_types: &HashMap<String, String>,
2169 let_types: &HashMap<String, String>,
2170 map: &mut HashMap<u32, String>,
2171) {
2172 // Walk the entire function (including its body) looking for call_expression nodes.
2173 let mut stack = vec![fn_node];
2174 while let Some(n) = stack.pop() {
2175 if n.kind() == "call_expression"
2176 && let Some(func) = n.child_by_field_name("function")
2177 && func.kind() == "field_expression"
2178 {
2179 // field_expression: value (receiver) + field (method name identifier)
2180 if let (Some(recv), Some(field)) = (
2181 func.child_by_field_name("value"),
2182 func.child_by_field_name("field"),
2183 ) {
2184 let recv_text = source[recv.start_byte()..recv.end_byte()].to_string();
2185 let receiver_type = if recv_text == "self" || recv_text == "*self" {
2186 impl_ctx.map(str::to_owned)
2187 } else {
2188 // Strip ref sigils for lookup.
2189 let base = recv_text
2190 .trim_start_matches('*')
2191 .trim_start_matches('&')
2192 .trim();
2193 param_types
2194 .get(base)
2195 .or_else(|| let_types.get(base))
2196 .cloned()
2197 };
2198
2199 if let Some(ty) = receiver_type {
2200 // The `@callee` capture byte offset is the start of the field node.
2201 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2202 let field_byte = field.start_byte() as u32;
2203 map.insert(field_byte, ty);
2204 }
2205 }
2206 }
2207 let mut cursor = n.walk();
2208 for child in n.children(&mut cursor) {
2209 stack.push(child);
2210 }
2211 }
2212}
2213
2214// ── Python receiver-type heuristic ───────────────────────────────────
2215
2216/// Walk the Python parse tree and fill `map` with receiver-type inference.
2217///
2218/// Two heuristic cases:
2219///
2220/// 1. **`self.method()` inside a method** — when the `attribute` call receiver is
2221/// the literal text `self`, the receiver type is the name of the nearest
2222/// enclosing `class_definition`.
2223///
2224/// 2. **`instance.method()` with a type annotation or constructor call** —
2225/// when a function parameter has a PEP 484 annotation `param: ClassName` or
2226/// when a local assignment `param = ClassName(...)` precedes the call, the
2227/// receiver type is bound to `ClassName`.
2228///
2229/// The Python call query captures:
2230/// - `(call function: (attribute attribute: (identifier) @callee)) @call`
2231///
2232/// Within the `attribute` node, `value` is the receiver expression and
2233/// `attribute` is the method name (the `@callee` capture). The `@callee`
2234/// byte offset is the start of the `attribute` child identifier node.
2235fn collect_python_receiver_types(
2236 source: &str,
2237 root: tree_sitter::Node<'_>,
2238 map: &mut HashMap<u32, String>,
2239) {
2240 // Stack carries (node, class_ctx: Option<String>).
2241 // class_ctx is the name of the nearest enclosing class_definition.
2242 let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(root, None)];
2243
2244 while let Some((n, class_ctx)) = stack.pop() {
2245 match n.kind() {
2246 "class_definition" => {
2247 // Extract the class name from the `name` child.
2248 let class_name = n
2249 .child_by_field_name("name")
2250 .map(|c| source[c.start_byte()..c.end_byte()].to_string());
2251 let new_ctx = class_name.or_else(|| class_ctx.clone());
2252 let mut cursor = n.walk();
2253 for child in n.children(&mut cursor) {
2254 stack.push((child, new_ctx.clone()));
2255 }
2256 }
2257 "function_definition" => {
2258 // Build parameter annotation map: param_name → type_name.
2259 let param_types = extract_python_param_types(source, n);
2260 // Build local assignment map: var_name → constructor type.
2261 let let_types = extract_python_assignment_types(source, n);
2262 // Annotate attribute call sites within this function body.
2263 annotate_python_method_calls(
2264 source,
2265 n,
2266 class_ctx.as_deref(),
2267 ¶m_types,
2268 &let_types,
2269 map,
2270 );
2271 // Push children with same class_ctx so nested classes are found.
2272 let mut cursor = n.walk();
2273 for child in n.children(&mut cursor) {
2274 stack.push((child, class_ctx.clone()));
2275 }
2276 }
2277 _ => {
2278 let mut cursor = n.walk();
2279 for child in n.children(&mut cursor) {
2280 stack.push((child, class_ctx.clone()));
2281 }
2282 }
2283 }
2284 }
2285}
2286
2287/// Extract Python parameter name → type annotation mappings.
2288///
2289/// Handles PEP 484 style: `def foo(self, x: Bar, y: Baz) -> ...`.
2290/// The `self` parameter is excluded (handled via class_ctx).
2291/// Returns `{"x": "Bar", "y": "Baz"}`.
2292fn extract_python_param_types(
2293 source: &str,
2294 fn_node: tree_sitter::Node<'_>,
2295) -> HashMap<String, String> {
2296 let mut params: HashMap<String, String> = HashMap::new();
2297 let Some(params_node) = fn_node.child_by_field_name("parameters") else {
2298 return params;
2299 };
2300
2301 // Parameters node children include `identifier`, `typed_parameter`,
2302 // `typed_default_parameter`, and others.
2303 let mut cursor = params_node.walk();
2304 for param in params_node.children(&mut cursor) {
2305 match param.kind() {
2306 "typed_parameter" => {
2307 // (typed_parameter (identifier) @name type: (type) @type)
2308 // First identifier child is the name; type child is the type.
2309 let mut name_text = None;
2310 let mut type_text = None;
2311 let mut pc = param.walk();
2312 for child in param.children(&mut pc) {
2313 match child.kind() {
2314 "identifier" if name_text.is_none() => {
2315 let t = source[child.start_byte()..child.end_byte()].to_string();
2316 if t != "self" && t != "cls" {
2317 name_text = Some(t);
2318 }
2319 }
2320 "type" | "identifier" | "attribute"
2321 if type_text.is_none() && name_text.is_some() =>
2322 {
2323 // The type child in tree-sitter-python is a `type` node
2324 // whose text is the annotation expression. Extract the
2325 // base identifier (handle `Optional[Bar]`, `List[Bar]`, etc.)
2326 type_text = Some(extract_python_base_type(source, child));
2327 }
2328 _ => {}
2329 }
2330 }
2331 if let (Some(name), Some(ty)) = (name_text, type_text)
2332 && !ty.is_empty()
2333 && !ty.eq("self")
2334 && !ty.eq("cls")
2335 {
2336 params.insert(name, ty);
2337 }
2338 }
2339 "typed_default_parameter" => {
2340 // (typed_default_parameter name: (identifier) type: (type) value: …)
2341 let name_node = param.child_by_field_name("name");
2342 let type_node = param.child_by_field_name("type");
2343 if let (Some(nn), Some(tn)) = (name_node, type_node) {
2344 let name = source[nn.start_byte()..nn.end_byte()].to_string();
2345 if name != "self" && name != "cls" {
2346 let ty = extract_python_base_type(source, tn);
2347 if !ty.is_empty() {
2348 params.insert(name, ty);
2349 }
2350 }
2351 }
2352 }
2353 _ => {}
2354 }
2355 }
2356 params
2357}
2358
2359/// Extract the base type name from a Python type annotation node.
2360///
2361/// For `Bar` → `"Bar"`. For `Optional[Bar]` or `List[Bar]` → `"Bar"`.
2362/// For `module.Class` → `"Class"` (bare name only).
2363fn extract_python_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2364 match node.kind() {
2365 "identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2366 // tree-sitter-python wraps annotations in a `type` node
2367 "type" => {
2368 let mut cursor = node.walk();
2369 for child in node.children(&mut cursor) {
2370 let t = extract_python_base_type(source, child);
2371 if !t.is_empty() {
2372 return t;
2373 }
2374 }
2375 String::new()
2376 }
2377 // Generic alias: `Optional[Bar]` — the first identifier child is `Optional`,
2378 // the subscript child contains `Bar`. We want the subscript content.
2379 "subscript" => {
2380 // subscript has value (e.g. Optional) and subscript (e.g. Bar).
2381 // Return the subscript's base type (the inner type argument).
2382 if let Some(sub) = node.child_by_field_name("subscript") {
2383 return extract_python_base_type(source, sub);
2384 }
2385 // Fall back: first identifier
2386 let mut cursor = node.walk();
2387 for child in node.children(&mut cursor) {
2388 if child.kind() == "identifier" {
2389 return source[child.start_byte()..child.end_byte()].to_string();
2390 }
2391 }
2392 String::new()
2393 }
2394 // Attribute node `module.Class` → take last identifier
2395 "attribute" => {
2396 if let Some(attr) = node.child_by_field_name("attribute") {
2397 return source[attr.start_byte()..attr.end_byte()].to_string();
2398 }
2399 String::new()
2400 }
2401 _ => {
2402 // Try first identifier child
2403 let mut cursor = node.walk();
2404 for child in node.children(&mut cursor) {
2405 if child.kind() == "identifier" {
2406 return source[child.start_byte()..child.end_byte()].to_string();
2407 }
2408 }
2409 String::new()
2410 }
2411 }
2412}
2413
2414/// Scan a Python function body for `x = ClassName(...)` assignment patterns.
2415///
2416/// Returns a map from local variable name to constructor type.
2417/// E.g., `x = Foo()` → `{"x": "Foo"}`.
2418/// Also handles `x = module.ClassName(...)` → `{"x": "ClassName"}`.
2419fn extract_python_assignment_types(
2420 source: &str,
2421 fn_node: tree_sitter::Node<'_>,
2422) -> HashMap<String, String> {
2423 let mut bindings: HashMap<String, String> = HashMap::new();
2424 let Some(body) = fn_node.child_by_field_name("body") else {
2425 return bindings;
2426 };
2427
2428 let mut stack = vec![body];
2429 while let Some(n) = stack.pop() {
2430 if n.kind() == "assignment" {
2431 // assignment: left = right
2432 // We want: left is a simple identifier, right is a call whose
2433 // function is an identifier starting with an uppercase letter
2434 // (Python convention for class names).
2435 let left = n.child_by_field_name("left");
2436 let right = n.child_by_field_name("right");
2437 if let (Some(lhs), Some(rhs)) = (left, right)
2438 && lhs.kind() == "identifier"
2439 && rhs.kind() == "call"
2440 && let Some(func) = rhs.child_by_field_name("function")
2441 {
2442 let var_name = source[lhs.start_byte()..lhs.end_byte()].to_string();
2443 let constructor_type = match func.kind() {
2444 "identifier" => {
2445 let t = source[func.start_byte()..func.end_byte()].to_string();
2446 // Class names are conventionally uppercase-first
2447 if t.chars().next().is_some_and(char::is_uppercase) {
2448 Some(t)
2449 } else {
2450 None
2451 }
2452 }
2453 "attribute" => {
2454 // `module.ClassName(...)` — take the `attribute` part
2455 func.child_by_field_name("attribute")
2456 .map(|a| source[a.start_byte()..a.end_byte()].to_string())
2457 }
2458 _ => None,
2459 };
2460 if let Some(ty) = constructor_type {
2461 bindings.insert(var_name, ty);
2462 }
2463 }
2464 }
2465 let mut cursor = n.walk();
2466 for child in n.children(&mut cursor) {
2467 stack.push(child);
2468 }
2469 }
2470 bindings
2471}
2472
2473/// Walk a Python function body and annotate attribute-call byte offsets.
2474///
2475/// A Python method call is:
2476/// `(call function: (attribute value: <receiver> attribute: (identifier) @callee))`
2477///
2478/// The receiver is the `value` child of `attribute`. This function checks:
2479/// - `self` → use the enclosing class name (`class_ctx`).
2480/// - An identifier matching a parameter type in `param_types`.
2481/// - An identifier matching a constructor assignment in `let_types`.
2482fn annotate_python_method_calls(
2483 source: &str,
2484 fn_node: tree_sitter::Node<'_>,
2485 class_ctx: Option<&str>,
2486 param_types: &HashMap<String, String>,
2487 let_types: &HashMap<String, String>,
2488 map: &mut HashMap<u32, String>,
2489) {
2490 let mut stack = vec![fn_node];
2491 while let Some(n) = stack.pop() {
2492 if n.kind() == "call"
2493 && let Some(func) = n.child_by_field_name("function")
2494 && func.kind() == "attribute"
2495 && let (Some(recv_node), Some(attr_node)) = (
2496 func.child_by_field_name("object"),
2497 func.child_by_field_name("attribute"),
2498 )
2499 {
2500 // attribute node: object (receiver) + attribute (method name)
2501 let recv_text = source[recv_node.start_byte()..recv_node.end_byte()].to_string();
2502 let receiver_type = if recv_text == "self" || recv_text == "cls" {
2503 class_ctx.map(str::to_owned)
2504 } else if recv_node.kind() == "identifier" {
2505 param_types
2506 .get(&recv_text)
2507 .or_else(|| let_types.get(&recv_text))
2508 .cloned()
2509 } else {
2510 None
2511 };
2512
2513 if let Some(ty) = receiver_type {
2514 // The `@callee` capture byte offset is the `attribute` child.
2515 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2516 let attr_byte = attr_node.start_byte() as u32;
2517 map.insert(attr_byte, ty);
2518 }
2519 }
2520 let mut cursor = n.walk();
2521 for child in n.children(&mut cursor) {
2522 stack.push(child);
2523 }
2524 }
2525}
2526
2527// ── Python class hierarchy (MRO) extraction ───────────────────────────
2528
2529/// Walk the Python parse tree and add class → parent-names entries to `out`.
2530///
2531/// tree-sitter-python shape:
2532/// ```text
2533/// (class_definition
2534/// name: (identifier) @child
2535/// superclasses: (argument_list
2536/// (identifier) @parent ; bare parent: class Foo(Bar):
2537/// (attribute attribute: (identifier) @parent) ; qualified: class Foo(mod.Bar):
2538/// (keyword_argument …) ; ignored: class Foo(Bar, metaclass=Meta):
2539/// )?
2540/// ...)
2541/// ```
2542///
2543/// Each map entry's key is a class defined in the file; the value is the
2544/// ordered list of declared parent class **names** (the trailing `attribute`
2545/// segment, so `mod.Bar` becomes `Bar`). Classes with no declared parents
2546/// still get an entry (empty `Vec`) so a downstream MRO walk can tell
2547/// "known class with no parents" from "unknown class".
2548fn extract_python_class_hierarchy_node(
2549 source: &str,
2550 root: tree_sitter::Node<'_>,
2551 out: &mut HashMap<String, Vec<String>>,
2552) {
2553 let mut stack = vec![root];
2554 while let Some(n) = stack.pop() {
2555 if n.kind() == "class_definition"
2556 && let Some(name_node) = n.child_by_field_name("name")
2557 {
2558 let class_name = source[name_node.start_byte()..name_node.end_byte()].to_string();
2559 let mut parents: Vec<String> = Vec::new();
2560 if let Some(superclasses) = n.child_by_field_name("superclasses") {
2561 // superclasses is an `argument_list`; iterate its children and
2562 // collect identifiers / attribute trailing-names. Skip
2563 // keyword_argument entries (metaclass=…, etc.) and punctuation.
2564 let mut sc = superclasses.walk();
2565 for child in superclasses.children(&mut sc) {
2566 match child.kind() {
2567 "identifier" => {
2568 let t = source[child.start_byte()..child.end_byte()].to_string();
2569 parents.push(t);
2570 }
2571 "attribute" => {
2572 // module.Cls → take the trailing `attribute` segment.
2573 if let Some(attr) = child.child_by_field_name("attribute") {
2574 parents
2575 .push(source[attr.start_byte()..attr.end_byte()].to_string());
2576 }
2577 }
2578 // Drop keyword_argument, "(", ")", ",", comments, etc.
2579 _ => {}
2580 }
2581 }
2582 }
2583 out.insert(class_name, parents);
2584 }
2585 let mut cursor = n.walk();
2586 for child in n.children(&mut cursor) {
2587 stack.push(child);
2588 }
2589 }
2590}
2591
2592/// Extract the Python `class → [parents]` map from a single source file by
2593/// parsing it with tree-sitter-python.
2594///
2595/// Returns an empty map when the source fails to parse or contains no
2596/// `class_definition` nodes. The returned map is the per-file contribution
2597/// to the global hierarchy used by [`resolve_calls_with_python_mro_pub`]
2598/// for MRO-aware receiver-type dispatch (Q1, Wave 2).
2599#[must_use]
2600pub fn extract_python_class_hierarchy(source: &str) -> HashMap<String, Vec<String>> {
2601 let mut parser = Parser::new();
2602 let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
2603 if parser.set_language(&lang).is_err() {
2604 return HashMap::new();
2605 }
2606 let Some(tree) = parser.parse(source, None) else {
2607 return HashMap::new();
2608 };
2609 let mut out: HashMap<String, Vec<String>> = HashMap::new();
2610 extract_python_class_hierarchy_node(source, tree.root_node(), &mut out);
2611 out
2612}
2613
/// Compute the linearised MRO (Method Resolution Order) for a Python class
/// name using a **simplified left-first depth-first walk** of the declared
/// `class → [parents]` hierarchy.
///
/// Python's real MRO uses C3 linearisation, which is monotonic and respects
/// declaration order across the diamond inheritance shape. For ripvec's
/// reverse-call-graph purpose we want *any plausible ancestor* of the
/// receiver type — including ancestors only reachable via a mixin — so we
/// can resolve `self.method()` calls whose dispatch lands on an ancestor.
/// The simplification: pre-order DFS, left-to-right, skipping repeats and
/// cycles via a `visited` set.
///
/// Ordering example: for `class Sub(Base, Mixin)` the walk yields
/// `Base, <Base's ancestors>, Mixin, <Mixin's ancestors>` — each parent's
/// whole chain is exhausted before moving on to the next parent. This agrees
/// with C3 on single-inheritance chains; on diamonds a shared ancestor may
/// surface earlier than C3 would place it, but every ancestor C3 would visit
/// is still reached.
///
/// # Returns
///
/// The ancestors of `start` in walk order. The list never contains `start`
/// itself — not even when the recorded hierarchy is cyclic or a class lists
/// a same-named parent (`class Foo(mod.Foo)` is recorded as `Foo → [Foo]`
/// because only the trailing name segment is kept). Each ancestor appears at
/// most once. An empty list is returned when `start` is not a key of
/// `hierarchy`.
fn compute_python_mro<H: std::hash::BuildHasher>(
    start: &str,
    hierarchy: &HashMap<String, Vec<String>, H>,
) -> Vec<String> {
    use std::collections::HashSet;

    // The receiver type's own scope was already searched by Priority 2's
    // direct match, so the walk begins at its immediate parents.
    let Some(start_parents) = hierarchy.get(start) else {
        return Vec::new();
    };

    // Seeding `visited` with `start` keeps it out of the result when the
    // hierarchy loops back to it.
    let mut visited: HashSet<&str> = HashSet::new();
    visited.insert(start);

    let mut order: Vec<String> = Vec::new();
    // DFS stack: parents are pushed in reverse so `pop` yields left-first order.
    let mut stack: Vec<&str> = start_parents.iter().rev().map(String::as_str).collect();
    while let Some(cls) = stack.pop() {
        if !visited.insert(cls) {
            continue;
        }
        order.push(cls.to_owned());
        if let Some(parents) = hierarchy.get(cls) {
            stack.extend(
                parents
                    .iter()
                    .rev()
                    .map(String::as_str)
                    .filter(|parent| !visited.contains(parent)),
            );
        }
    }
    order
}
2666
2667// ── Go receiver-type heuristic ────────────────────────────────────────
2668
2669/// Walk the Go parse tree and fill `map` with receiver-type inference.
2670///
2671/// One heuristic case: **`recv.Method()` inside a `method_declaration`**.
2672///
2673/// Go methods have an explicit receiver parameter in their signature:
2674/// `func (r *Foo) Bar() { r.Baz() }` — `r` is bound to type `Foo`.
2675///
2676/// The Go call query captures:
2677/// `(call_expression function: (selector_expression field: (field_identifier) @callee))`
2678///
2679/// Within `selector_expression`, `operand` is the receiver expression and
2680/// `field` is the method name (the `@callee` capture).
2681///
2682/// This function also handles `self.method()` patterns for cases where code
2683/// uses `self` as a receiver name (not idiomatic Go, but it occurs).
2684fn collect_go_receiver_types(
2685 source: &str,
2686 root: tree_sitter::Node<'_>,
2687 map: &mut HashMap<u32, String>,
2688) {
2689 // Stack carries (node, receiver_binding: Option<(recv_name, recv_type)>).
2690 let mut stack: Vec<(tree_sitter::Node<'_>, Option<(String, String)>)> = vec![(root, None)];
2691
2692 while let Some((n, recv_binding)) = stack.pop() {
2693 if n.kind() == "method_declaration" {
2694 // Extract the receiver name and type from the method signature.
2695 let binding = extract_go_receiver_binding(source, n);
2696 let new_binding = binding.or_else(|| recv_binding.clone());
2697 let mut cursor = n.walk();
2698 for child in n.children(&mut cursor) {
2699 stack.push((child, new_binding.clone()));
2700 }
2701 } else {
2702 // For any call_expression whose function is a selector_expression,
2703 // check if the operand matches the active receiver binding.
2704 if n.kind() == "call_expression"
2705 && let Some(func) = n.child_by_field_name("function")
2706 && func.kind() == "selector_expression"
2707 && let (Some(operand), Some(field)) = (
2708 func.child_by_field_name("operand"),
2709 func.child_by_field_name("field"),
2710 )
2711 {
2712 let recv_text = source[operand.start_byte()..operand.end_byte()].to_string();
2713 let receiver_type = recv_binding.as_ref().and_then(|(recv_name, recv_ty)| {
2714 if recv_text == *recv_name {
2715 Some(recv_ty.clone())
2716 } else {
2717 None
2718 }
2719 });
2720
2721 if let Some(ty) = receiver_type {
2722 // The `@callee` capture byte offset is the `field` child.
2723 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2724 let field_byte = field.start_byte() as u32;
2725 map.insert(field_byte, ty);
2726 }
2727 }
2728
2729 let mut cursor = n.walk();
2730 for child in n.children(&mut cursor) {
2731 stack.push((child, recv_binding.clone()));
2732 }
2733 }
2734 }
2735}
2736
2737/// Extract the receiver name and base type from a Go `method_declaration`.
2738///
2739/// Go method declaration shape (tree-sitter-go):
2740/// ```text
2741/// (method_declaration
2742/// receiver: (parameter_list
2743/// (parameter_declaration
2744/// name: (identifier) ← receiver name
2745/// type: (type_identifier ← receiver type (bare)
2746/// | pointer_type (type_identifier)) ← or *Type
2747/// )
2748/// )
2749/// name: (field_identifier)
2750/// ...
2751/// )
2752/// ```
2753///
2754/// Returns `Some((recv_name, type_name))` or `None` if the receiver is unnamed
2755/// (blank identifier `_`) or has an unrecognisable shape.
2756fn extract_go_receiver_binding(
2757 source: &str,
2758 method_node: tree_sitter::Node<'_>,
2759) -> Option<(String, String)> {
2760 let receiver_list = method_node.child_by_field_name("receiver")?;
2761 // parameter_list contains one parameter_declaration
2762 let mut cursor = receiver_list.walk();
2763 for param in receiver_list.children(&mut cursor) {
2764 if param.kind() == "parameter_declaration" {
2765 let name_node = param.child_by_field_name("name");
2766 let type_node = param.child_by_field_name("type");
2767 if let (Some(nn), Some(tn)) = (name_node, type_node) {
2768 let name = source[nn.start_byte()..nn.end_byte()].to_string();
2769 if name == "_" || name.is_empty() {
2770 return None;
2771 }
2772 let ty = extract_go_base_type(source, tn);
2773 if !ty.is_empty() {
2774 return Some((name, ty));
2775 }
2776 }
2777 }
2778 }
2779 None
2780}
2781
2782/// Extract the base type name from a Go type node.
2783///
2784/// For `Foo` (type_identifier) → `"Foo"`.
2785/// For `*Foo` (pointer_type → type_identifier) → `"Foo"`.
2786fn extract_go_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2787 match node.kind() {
2788 "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2789 "pointer_type" => {
2790 // pointer_type has one child: the pointee type
2791 let mut cursor = node.walk();
2792 for child in node.children(&mut cursor) {
2793 if child.kind() == "type_identifier" {
2794 return source[child.start_byte()..child.end_byte()].to_string();
2795 }
2796 let t = extract_go_base_type(source, child);
2797 if !t.is_empty() {
2798 return t;
2799 }
2800 }
2801 String::new()
2802 }
2803 _ => {
2804 let mut cursor = node.walk();
2805 for child in node.children(&mut cursor) {
2806 if child.kind() == "type_identifier" {
2807 return source[child.start_byte()..child.end_byte()].to_string();
2808 }
2809 }
2810 String::new()
2811 }
2812 }
2813}
2814
2815/// Enrich Go `method_declaration` definition scopes with their receiver type name.
2816///
2817/// In the generic `extract_definitions` path, `build_scope_chain` walks the
2818/// *parent* chain of the `@def` node. For Go `method_declaration`, the parent
2819/// is the file root — so the scope is always `""`.
2820///
2821/// An empty scope means `resolve_calls` Priority 2 (receiver-type matching via
2822/// `scope.contains(recv_type)`) never fires for Go methods. Cross-file calls
2823/// where the caller inferred `receiver_type = Some("Foo")` stay unresolved;
2824/// no edge is recorded; `def_callers[]` stays empty for those defs — the root
2825/// cause of the missing inverse index for Go (I#P1).
2826///
2827/// Fix: after `extract_definitions`, parse the Go source a second time to find
2828/// each `method_declaration`'s receiver type, then set the matching def's scope
2829/// to `"method_declaration {ReceiverType}"`. This matches the pattern used by
2830/// the existing `go_resolve_receiver_method_via_signature` integration test,
2831/// which asserts that `scope.contains("Foo")` succeeds when the scope is
2832/// `"method_declaration Foo"`.
2833///
2834/// Matching is by `start_byte` (precise) so name collisions across different
2835/// receiver types are handled correctly.
2836fn enrich_go_method_def_scopes(source: &str, defs: &mut [Definition]) {
2837 let go_lang: tree_sitter::Language = tree_sitter_go::LANGUAGE.into();
2838 let mut parser = Parser::new();
2839 if parser.set_language(&go_lang).is_err() {
2840 return;
2841 }
2842 let Some(tree) = parser.parse(source, None) else {
2843 return;
2844 };
2845
2846 // Walk all top-level method_declaration nodes.
2847 let root = tree.root_node();
2848 let mut method_cursor = root.walk();
2849 for child in root.children(&mut method_cursor) {
2850 if child.kind() != "method_declaration" {
2851 continue;
2852 }
2853 let Some((_, recv_type)) = extract_go_receiver_binding(source, child) else {
2854 continue;
2855 };
2856 // Match by start_byte (precise): the @def node for method_declaration in
2857 // the Go definition query is the method_declaration node itself, so its
2858 // start_byte matches the def's start_byte recorded during extract_definitions.
2859 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2860 let method_start_byte = child.start_byte() as u32;
2861 for def in defs.iter_mut() {
2862 if def.kind == "method_declaration" && def.start_byte == method_start_byte {
2863 def.scope = format!("method_declaration {recv_type}");
2864 break;
2865 }
2866 }
2867 }
2868}
2869
2870/// Public wrapper for `enrich_go_method_def_scopes` — enables integration tests
2871/// to call it directly without going through the full `build_graph` pipeline.
2872pub fn enrich_go_method_def_scopes_pub(source: &str, defs: &mut [Definition]) {
2873 enrich_go_method_def_scopes(source, defs);
2874}
2875
2876/// SQL: prepend a synthetic whole-file definition whose name is the filename stem.
2877///
2878/// dbt and sqlmesh follow a filename-as-model-name convention:
2879///
2880/// - `silver_issuer_returns.sql` defines the `silver_issuer_returns` model.
2881/// - `gold_issuer_returns.sql` references the silver model by filename stem,
2882/// not by any in-source CREATE TABLE.
2883///
2884/// In sqlmesh, the in-source name is templated:
2885///
2886/// ```sql
2887/// MODEL (
2888/// name @{athena_sqlmesh_silver_schema}.issuer_returns,
2889/// ...
2890/// );
2891/// SELECT ... FROM @{athena_sqlmesh_silver_schema}.stg_issuer_returns;
2892/// ```
2893///
2894/// The `MODEL (...)` header parses as an ERROR node under tree-sitter-sequel
2895/// because `@{var}` is not standard SQL; FROM/JOIN further down the file still
2896/// extract cleanly. Without a synthetic def, `lsp_workspace_symbols(query=
2897/// "silver_issuer_returns")` returns no hits — there is no real CREATE TABLE
2898/// in the file and the model name is interpolation only.
2899///
2900/// This helper prepends a definition with:
2901/// - `name` = filename stem (e.g., `silver_issuer_returns`)
2902/// - `kind` = `"sql_file"` (maps to `LSP SymbolKind::File` in
2903/// [`languages::lsp_symbol_kind_for_node_kind`])
2904/// - byte range = the entire source `[0, source.len())`
2905/// - scope / signature / qualified_name = empty / None
2906///
2907/// The whole-file byte range is the key to FROM/JOIN attribution: when
2908/// `extract_calls` later places a CallRef from a FROM clause that is not
2909/// inside any CTE or other smaller def, the smallest-enclosing-def search
2910/// lands on this synthetic file def and the edge is recorded.
2911///
2912/// If the filename has no stem (empty / `..`), the helper is a no-op.
2913/// Idempotent: if a `sql_file` def already exists at byte 0, it is left alone.
2914pub(crate) fn enrich_sql_file_def(filename: &str, source: &str, defs: &mut Vec<Definition>) {
2915 // Idempotency: do nothing if a sql_file def is already present at byte 0.
2916 if defs
2917 .iter()
2918 .any(|d| d.kind == "sql_file" && d.start_byte == 0)
2919 {
2920 return;
2921 }
2922
2923 // Derive the filename stem (last path component, file extension stripped).
2924 let stem = std::path::Path::new(filename)
2925 .file_stem()
2926 .and_then(|s| s.to_str())
2927 .unwrap_or_default();
2928 if stem.is_empty() {
2929 return;
2930 }
2931
2932 // Count newlines so end_line is reasonable for downstream UI.
2933 let end_line_zero_based = source.bytes().filter(|&b| b == b'\n').count();
2934 #[expect(clippy::cast_possible_truncation, reason = "line counts fit in u32")]
2935 let end_line = (end_line_zero_based as u32) + 1;
2936 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2937 let end_byte = source.len() as u32;
2938
2939 let file_def = Definition {
2940 name: stem.to_string(),
2941 kind: "sql_file".to_string(),
2942 start_line: 1,
2943 end_line,
2944 scope: String::new(),
2945 signature: None,
2946 start_byte: 0,
2947 end_byte,
2948 calls: vec![],
2949 decorator: None,
2950 lsp_kind_hint: None,
2951 };
2952 // Prepend so it remains the outermost (largest) enclosing def at byte 0,
2953 // ensuring narrow CTE defs are still preferred for inner-FROM attribution
2954 // (the smallest-enclosing rule in `extract_calls`).
2955 defs.insert(0, file_def);
2956}
2957
2958/// Public wrapper for [`enrich_sql_file_def`] — enables integration tests
2959/// to call it directly without going through the full `build_graph` pipeline.
2960///
2961/// This is `pub` (not `pub(crate)`) because integration tests in
2962/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
2963/// `pub(crate)` items.
2964pub fn enrich_sql_file_def_pub(filename: &str, source: &str, defs: &mut Vec<Definition>) {
2965 enrich_sql_file_def(filename, source, defs);
2966}
2967
2968/// Public wrapper for `extract_calls` — enables integration tests to call it
2969/// directly without going through the full `build_graph` pipeline.
2970///
2971/// This is `pub` (not `pub(crate)`) because integration tests in
2972/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
2973/// `pub(crate)` items.
2974pub fn extract_calls_pub(
2975 source: &str,
2976 call_config: &languages::CallConfig,
2977 defs: &mut [Definition],
2978) {
2979 extract_calls(source, call_config, defs);
2980}
2981
2982/// Public wrapper for [`extract_definitions`] — enables integration tests in
2983/// `crates/ripvec-core/tests/` to drive Python decorator detection (C1, 4.1.1).
2984///
2985/// Returns the [`Definition`] list for `source` parsed as the language
2986/// identified by `lang_config`.
2987#[must_use]
2988pub fn extract_definitions_pub(
2989 source: &str,
2990 lang_config: &languages::LangConfig,
2991) -> Vec<Definition> {
2992 extract_definitions(source, lang_config)
2993}
2994
2995/// Build an index from definition name to list of `DefId`s.
2996#[must_use]
2997pub fn build_def_index_pub(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2998 build_def_index(files)
2999}
3000
3001fn build_def_index(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
3002 let mut index: HashMap<String, Vec<DefId>> = HashMap::new();
3003 for (file_idx, file) in files.iter().enumerate() {
3004 for (def_idx, def) in file.defs.iter().enumerate() {
3005 #[expect(clippy::cast_possible_truncation)]
3006 let did: DefId = (file_idx as u32, def_idx as u16);
3007 index.entry(def.name.clone()).or_default().push(did);
3008 }
3009 }
3010 index
3011}
3012
3013/// Resolve call references to target definitions.
3014///
3015/// Resolution priority:
3016///
3017/// 1. **Qualified path** (`qualified_path = Some("mod_a::foo")`): filter candidates
3018/// by qualifier match (file path or scope contains the qualifier segment). Unique
3019/// match → resolve; ambiguous or no match → leave `None`.
3020/// 2. **Receiver type — direct scope match** (`receiver_type = Some("Foo")`):
3021/// for method calls, prefer candidates whose `scope` contains the receiver
3022/// type name (e.g., `"impl_item Foo"`). Among receiver-matching candidates,
3023/// further prefer those in imported files. Unique match → resolve;
3024/// ambiguous → leave `None`.
3025/// When this step finds nothing on the receiver class itself, sub-step 2b
3026/// (Python MRO walk) runs: when `Foo` has a recorded parent chain, walk
3027/// the receiver class's MRO (left-first DFS) and try the scope-match
3028/// against each ancestor's name. First ancestor with a matching candidate
3029/// wins. See [`compute_python_mro`] for the simplification rationale (it
3030/// diverges from C3 only on diamond shapes where two ancestors define the
3031/// same name).
3032/// 3. **Same file** (unqualified, no receiver): prefer definitions in the caller's
3033/// own file.
3034/// 4. **SQL suffix-match** (sql_file callers only, no exact-name match): when
3035/// the caller def has `kind = "sql_file"` and the bare `call_name` (e.g.,
3036/// `"issuer_returns"`) has no exact entry in the def index, scan all
3037/// `sql_file` defs for names ending with `_<call_name>` (e.g.,
3038/// `"silver_issuer_returns"`). This bridges dbt / sqlmesh layered schema
3039/// prefixes: `gold_issuer_returns.sql` uses `FROM @{schema}.issuer_returns`
3040/// which tree-sitter reduces to the bare name `"issuer_returns"`, while the
3041/// target def is the synthetic `sql_file` def named `"silver_issuer_returns"`.
3042/// Unique suffix-match → resolve; ambiguous (multiple layers match) or
3043/// no match → leave `None`. Non-sql_file callers are explicitly excluded.
3044/// 5. **Imported file** (unqualified, no receiver): check definitions in files this
3045/// file imports. Unique imported candidate → resolve.
3046/// 6. **Global-unique fallback**: when a bare call name maps to exactly one def in the
3047/// entire graph — regardless of file or import relationship — resolve to it.
3048/// Handles trait-method dispatch (`Trait::method` called as bare `method`) and
3049/// struct constructors referenced across non-imported module boundaries.
3050/// Only fires when Priorities 1–5 left the call unresolved and exactly one
3051/// candidate exists. Ambiguous (>1 candidates) → leave `None`.
3052/// 7. **Ambiguous or unresolved**: leave `resolved` as `None` (no silent first-wins).
3053///
3054/// Equivalent to [`resolve_calls_with_python_mro_pub`] with an empty MRO map
3055/// (Priority 2.5 is a no-op).
3056pub fn resolve_calls_pub<S: std::hash::BuildHasher>(
3057 files: &mut [FileNode],
3058 def_index: &HashMap<String, Vec<DefId>, S>,
3059) {
3060 let empty: HashMap<String, Vec<String>> = HashMap::new();
3061 resolve_calls_inner(files, def_index, &empty);
3062}
3063
3064/// Resolve call references with MRO-aware Python receiver dispatch enabled.
3065///
3066/// Identical to [`resolve_calls_pub`] except that Priority 2.5 (the MRO walk)
3067/// fires when the caller passes a non-empty `python_class_hierarchy`.
3068/// `build_graph` populates the hierarchy by parsing every Python source file
3069/// with [`extract_python_class_hierarchy`] and merging the per-file maps.
3070///
3071/// Tests that want to exercise MRO resolution without going through
3072/// `build_graph` can call this directly with a synthetic hierarchy.
3073pub fn resolve_calls_with_python_mro_pub<S, H>(
3074 files: &mut [FileNode],
3075 def_index: &HashMap<String, Vec<DefId>, S>,
3076 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
3077) where
3078 S: std::hash::BuildHasher,
3079 H: std::hash::BuildHasher,
3080{
3081 resolve_calls_inner(files, def_index, python_class_hierarchy);
3082}
3083
3084fn resolve_calls<S, H>(
3085 files: &mut [FileNode],
3086 def_index: &HashMap<String, Vec<DefId>, S>,
3087 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
3088) where
3089 S: std::hash::BuildHasher,
3090 H: std::hash::BuildHasher,
3091{
3092 resolve_calls_inner(files, def_index, python_class_hierarchy);
3093}
3094
3095#[expect(
3096 clippy::too_many_lines,
3097 reason = "8-priority resolution cascade (qualified path, receiver type, MRO walk, same-file, \
3098 SQL suffix-match, imported-file, global-unique, ambiguous); each priority is a \
3099 distinct decision branch and extracting helpers would require passing large shared \
3100 state across boundaries"
3101)]
3102fn resolve_calls_inner<S, H>(
3103 files: &mut [FileNode],
3104 def_index: &HashMap<String, Vec<DefId>, S>,
3105 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
3106) where
3107 S: std::hash::BuildHasher,
3108 H: std::hash::BuildHasher,
3109{
3110 // Pre-compute imported file sets for each file.
3111 let imported_files: Vec<std::collections::HashSet<u32>> = files
3112 .iter()
3113 .map(|f| {
3114 f.imports
3115 .iter()
3116 .filter_map(|imp| imp.resolved_idx)
3117 .collect()
3118 })
3119 .collect();
3120
3121 for file_idx in 0..files.len() {
3122 for def_idx in 0..files[file_idx].defs.len() {
3123 for call_idx in 0..files[file_idx].defs[def_idx].calls.len() {
3124 let call_name = files[file_idx].defs[def_idx].calls[call_idx].name.clone();
3125 let qualified_path = files[file_idx].defs[def_idx].calls[call_idx]
3126 .qualified_path
3127 .clone();
3128 let receiver_type = files[file_idx].defs[def_idx].calls[call_idx]
3129 .receiver_type
3130 .clone();
3131
3132 // I#54a — HCL output-attribute resolution. When the extractor
3133 // emits `terraform_remote_state.<NAME>.outputs.<ATTR>` for a
3134 // consumer reference, bind it to the upstream module's
3135 // `output "<ATTR>" { ... }` def. The def's `name` is exactly
3136 // ATTR (per the HCL @name capture in `compile_config`), and
3137 // the def lives in some `.tf` / `.tfvars` / `.hcl` file in
3138 // the workspace. This is the layout-agnostic resolution path:
3139 // when the upstream tfstate lives in a sibling file
3140 // (`infra/foo.tf` rather than `infra/<NAME>/main.tf`), the
3141 // path-segment branch below cannot help. Unique candidate →
3142 // resolve; ambiguous (the same output name in multiple HCL
3143 // files) → leave None.
3144 if let Some(ref qpath) = qualified_path
3145 && qpath.starts_with("terraform_remote_state.")
3146 && qpath.contains(".outputs.")
3147 && !call_name.is_empty()
3148 {
3149 if let Some(candidates) = def_index.get(&call_name) {
3150 let hcl_matches: Vec<DefId> = candidates
3151 .iter()
3152 .copied()
3153 .filter(|&(f_idx, _)| {
3154 let path = std::path::Path::new(&files[f_idx as usize].path);
3155 path.extension().is_some_and(|ext| {
3156 ext.eq_ignore_ascii_case("tf")
3157 || ext.eq_ignore_ascii_case("tfvars")
3158 || ext.eq_ignore_ascii_case("hcl")
3159 })
3160 })
3161 .collect();
3162 if hcl_matches.len() == 1 {
3163 files[file_idx].defs[def_idx].calls[call_idx].resolved =
3164 Some(hcl_matches[0]);
3165 }
3166 }
3167 continue;
3168 }
3169
3170 // HCL: dedicated resolution for `terraform_remote_state.<NAME>`
3171 // and `module.<NAME>` qualified paths. Aurora's module DAG is
3172 // expressed by these patterns; resolve to the first def in any
3173 // file under a `/<NAME>/` directory segment. This is the
3174 // module-source contract (R2 + R3, Wave 3).
3175 if let Some(ref qpath) = qualified_path
3176 && (qpath.starts_with("terraform_remote_state.")
3177 || qpath.starts_with("module."))
3178 {
3179 let target = &call_name; // already the bare module label
3180 let segment_match = format!("/{target}/");
3181 let alt_segment_prefix = format!("{target}/"); // when path starts with target dir
3182 let candidate = files.iter().enumerate().find_map(|(idx, f)| {
3183 if f.path.contains(&segment_match)
3184 || f.path.starts_with(&alt_segment_prefix)
3185 {
3186 // Pick the first def in the file (or skip if file
3187 // has no defs).
3188 if !f.defs.is_empty() {
3189 #[expect(
3190 clippy::cast_possible_truncation,
3191 reason = "file index fits in u32"
3192 )]
3193 {
3194 return Some((idx as u32, 0u16));
3195 }
3196 }
3197 }
3198 None
3199 });
3200 if let Some(did) = candidate {
3201 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3202 }
3203 continue;
3204 }
3205
3206 // ── Priority 1: Qualified-path resolution ────────────────
3207 //
3208 // `qualified_path` carries the full scoped path (e.g. "mod_a::foo").
3209 // We look up candidates by the bare `call_name`, then filter by
3210 // whether the file path or scope contains the qualifier prefix.
3211 if let Some(ref qpath) = qualified_path {
3212 // Qualifier is everything before the final `::`.
3213 let qualifier = if let Some(pos) = qpath.rfind("::") {
3214 &qpath[..pos]
3215 } else {
3216 qpath.as_str()
3217 };
3218 let qual_segments: Vec<&str> = qualifier.split("::").collect();
3219
3220 let Some(candidates) = def_index.get(&call_name) else {
3221 continue;
3222 };
3223
3224 let matching: Vec<DefId> = candidates
3225 .iter()
3226 .copied()
3227 .filter(|&(f_idx, _)| {
3228 let file_path = &files[f_idx as usize].path;
3229 let last_segment = qual_segments.last().copied().unwrap_or("");
3230 let path_as_module =
3231 file_path.trim_end_matches(".rs").replace(['/', '\\'], "::");
3232 path_as_module.contains(last_segment)
3233 || file_path.contains(last_segment)
3234 })
3235 .collect();
3236
3237 if matching.len() == 1 {
3238 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(matching[0]);
3239 }
3240 // Ambiguous or no match → leave None.
3241 continue;
3242 }
3243
3244 // ── Priority 3.5: SQL suffix-match resolution ────────────
3245 //
3246 // dbt / sqlmesh pipelines use layered schema prefixes:
3247 // `silver_issuer_returns.sql` defines the silver-layer model.
3248 // `gold_issuer_returns.sql` references it via a FROM clause
3249 // that tree-sitter parses as `name: "issuer_returns"` (the
3250 // `name:` field-selector strips the `@{schema}.` prefix).
3251 // `def_index.get("issuer_returns")` returns None — the def is
3252 // stored under "silver_issuer_returns".
3253 //
3254 // When: (a) no exact-name candidate exists, AND (b) the
3255 // caller's enclosing def is a sql_file (whole-file synthetic
3256 // def emitted by `enrich_sql_file_def`), walk every sql_file
3257 // def in the graph and check whether its name ends with
3258 // `_<call_name>`. Unique suffix-match → resolve. Ambiguous
3259 // (e.g., both gold_ and silver_ match the same bare name) →
3260 // leave None (no silent first-wins).
3261 //
3262 // Non-sql_file callers are explicitly excluded: a Rust
3263 // function_item whose call_name happens to end with a suffix
3264 // of some sql_file def must NOT be resolved via this path.
3265 if !def_index.contains_key(&call_name)
3266 && files[file_idx].defs[def_idx].kind == "sql_file"
3267 && !call_name.is_empty()
3268 {
3269 let suffix = format!("_{call_name}");
3270 let suffix_str = suffix.as_str();
3271 // Exclude the caller def itself from the suffix scan:
3272 // `gold_issuer_returns` also ends with `_issuer_returns`
3273 // but must not self-resolve.
3274 #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
3275 let caller_did: DefId = (file_idx as u32, def_idx as u16);
3276 let suffix_matches: Vec<DefId> = files
3277 .iter()
3278 .enumerate()
3279 .flat_map(|(f_idx, f)| {
3280 f.defs.iter().enumerate().filter_map(move |(d_idx, d)| {
3281 #[expect(
3282 clippy::cast_possible_truncation,
3283 reason = "file and def indices fit in u32/u16"
3284 )]
3285 let did: DefId = (f_idx as u32, d_idx as u16);
3286 if d.kind == "sql_file"
3287 && d.name.ends_with(suffix_str)
3288 && did != caller_did
3289 {
3290 Some(did)
3291 } else {
3292 None
3293 }
3294 })
3295 })
3296 .collect();
3297 if suffix_matches.len() == 1 {
3298 files[file_idx].defs[def_idx].calls[call_idx].resolved =
3299 Some(suffix_matches[0]);
3300 }
3301 // Ambiguous (>1) or no match (0) → leave None.
3302 continue;
3303 }
3304
3305 let Some(candidates) = def_index.get(&call_name) else {
3306 continue;
3307 };
3308
3309 // ── Priority 2: Receiver-type resolution ─────────────────
3310 //
3311 // `receiver_type = Some("Foo")` means this is a method call on a
3312 // value whose type is `Foo`. Filter candidates to those whose scope
3313 // chain contains the receiver type name.
3314 if let Some(ref rtype) = receiver_type {
3315 // Candidates whose scope contains the receiver type name.
3316 let receiver_matching: Vec<DefId> = candidates
3317 .iter()
3318 .copied()
3319 .filter(|&(f_idx, d_idx)| {
3320 let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
3321 scope.contains(rtype.as_str())
3322 })
3323 .collect();
3324
3325 if receiver_matching.len() == 1 {
3326 files[file_idx].defs[def_idx].calls[call_idx].resolved =
3327 Some(receiver_matching[0]);
3328 continue;
3329 }
3330
3331 if receiver_matching.len() > 1 {
3332 // Among receiver-matching candidates, prefer those in imported files.
3333 let imported_receiver_matching: Vec<DefId> = receiver_matching
3334 .iter()
3335 .copied()
3336 .filter(|(f, _)| imported_files[file_idx].contains(f))
3337 .collect();
3338 if imported_receiver_matching.len() == 1 {
3339 files[file_idx].defs[def_idx].calls[call_idx].resolved =
3340 Some(imported_receiver_matching[0]);
3341 }
3342 // Ambiguous even after import filter → leave None.
3343 continue;
3344 }
3345
3346 // ── Priority 2.5: Python MRO walk ─────────────────────
3347 //
3348 // The receiver type's own scope has no matching def — but
3349 // the method may live on an ancestor class. Walk the MRO
3350 // (left-first DFS) and try the scope-match against each
3351 // ancestor's name. First ancestor with at least one
3352 // scope-matching candidate wins; if multiple candidates
3353 // match for the same ancestor, prefer imported files,
3354 // else take the first in stable order.
3355 //
3356 // Liskov: a subclass's `self.method()` call must dispatch
3357 // through the MRO; over-approximating ancestors is the
3358 // correct conservative move for a reverse call graph.
3359 // For non-Python languages or Python receivers with no
3360 // recorded parents, `compute_python_mro` returns an
3361 // empty vector and this loop is a no-op.
3362 let mro = compute_python_mro(rtype, python_class_hierarchy);
3363 let mut resolved_via_mro: Option<DefId> = None;
3364 for ancestor in &mro {
3365 let ancestor_matching: Vec<DefId> = candidates
3366 .iter()
3367 .copied()
3368 .filter(|&(f_idx, d_idx)| {
3369 let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
3370 scope.contains(ancestor.as_str())
3371 })
3372 .collect();
3373 if ancestor_matching.len() == 1 {
3374 resolved_via_mro = Some(ancestor_matching[0]);
3375 break;
3376 }
3377 if ancestor_matching.len() > 1 {
3378 // Prefer imported files among the ancestor matches.
3379 let imported_ancestor: Vec<DefId> = ancestor_matching
3380 .iter()
3381 .copied()
3382 .filter(|(f, _)| imported_files[file_idx].contains(f))
3383 .collect();
3384 if imported_ancestor.len() == 1 {
3385 resolved_via_mro = Some(imported_ancestor[0]);
3386 break;
3387 }
3388 // Ambiguous at this ancestor — pick the first
3389 // candidate in stable order. The MRO walk's job
3390 // is to find *an* implementing def for an
3391 // inherited call, not to compute the runtime
3392 // dispatch winner; any plausible candidate is
3393 // useful for the reverse call graph.
3394 resolved_via_mro = Some(ancestor_matching[0]);
3395 break;
3396 }
3397 }
3398 if let Some(did) = resolved_via_mro {
3399 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3400 continue;
3401 }
3402 // No receiver-matching candidates anywhere in the MRO →
3403 // fall through to bare-name resolution.
3404 }
3405
3406 // ── Priority 3: Same-file resolution ─────────────────────
3407 #[expect(clippy::cast_possible_truncation)]
3408 let file_idx_u32 = file_idx as u32;
3409 if let Some(&did) = candidates.iter().find(|(f, _)| *f == file_idx_u32) {
3410 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3411 continue;
3412 }
3413
3414 // ── Priority 4: Imported-file resolution ──────────────────
3415 let imported_candidates: Vec<DefId> = candidates
3416 .iter()
3417 .copied()
3418 .filter(|(f, _)| imported_files[file_idx].contains(f))
3419 .collect();
3420 if imported_candidates.len() == 1 {
3421 files[file_idx].defs[def_idx].calls[call_idx].resolved =
3422 Some(imported_candidates[0]);
3423 }
3424
3425 // ── Priority 5.5 (I#54b): SQL file-level resolution ──────
3426 //
3427 // When the caller's enclosing def is a `sql_file` (the
3428 // synthetic whole-file def emitted by `enrich_sql_file_def`)
3429 // and at least one candidate is also a `sql_file` in a
3430 // different file, prefer that candidate. dbt/sqlmesh models
3431 // are named by filename; a `FROM upstream_table` reference
3432 // means "the model defined in `upstream_table.sql`", which
3433 // is exactly the sql_file def named `upstream_table`.
3434 //
3435 // Without this branch the bare-name lookup is ambiguous on
3436 // the global-unique step whenever the upstream file also
3437 // contains a `CREATE TABLE upstream_table` (two co-located
3438 // candidates: `create_table` and `sql_file` — same logical
3439 // entity but two distinct `Definition` records). The
3440 // sql_file is the correct target for cross-file edges
3441 // because it carries `calls[]` for further chaining.
3442 //
3443 // Only fires for sql_file callers — Rust / Python /
3444 // JavaScript bare-name calls that happen to match a
3445 // sql_file def must NOT be resolved through this path.
3446 if files[file_idx].defs[def_idx].kind == "sql_file"
3447 && files[file_idx].defs[def_idx].calls[call_idx]
3448 .resolved
3449 .is_none()
3450 {
3451 #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
3452 let caller_file_u32 = file_idx as u32;
3453 let sql_file_targets: Vec<DefId> = candidates
3454 .iter()
3455 .copied()
3456 .filter(|&(f_idx, d_idx)| {
3457 f_idx != caller_file_u32
3458 && files[f_idx as usize].defs[d_idx as usize].kind == "sql_file"
3459 })
3460 .collect();
3461 if sql_file_targets.len() == 1 {
3462 files[file_idx].defs[def_idx].calls[call_idx].resolved =
3463 Some(sql_file_targets[0]);
3464 continue;
3465 }
3466 }
3467
3468 // ── Priority 6: Global-unique fallback ────────────────────────────
3469 //
3470 // When a bare call name maps to exactly one def in the entire graph
3471 // — regardless of file or import relationship — resolve to it. This
3472 // bridges trait-method dispatch (`Trait::method` called as bare
3473 // `method`) and constructors referenced across non-imported module
3474 // boundaries.
3475 //
3476 // Guard with `.is_none()` so we do not overwrite a Priority 4
3477 // (imported-file) resolution that already fired.
3478 if candidates.len() == 1
3479 && files[file_idx].defs[def_idx].calls[call_idx]
3480 .resolved
3481 .is_none()
3482 {
3483 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(candidates[0]);
3484 }
3485 // Priority 7: Ambiguous or unresolved → leave None.
3486 }
3487 }
3488 }
3489}
3490
3491/// Compute a prefix-sum offset table for flattening `DefId`s to linear indices.
3492fn def_offsets(files: &[FileNode]) -> Vec<usize> {
3493 let mut offsets = Vec::with_capacity(files.len() + 1);
3494 offsets.push(0);
3495 for file in files {
3496 offsets.push(offsets.last().unwrap() + file.defs.len());
3497 }
3498 offsets
3499}
3500
3501/// Flatten a `DefId` to a linear index using the offset table.
3502fn flatten_def_id(offsets: &[usize], did: DefId) -> usize {
3503 offsets[did.0 as usize] + did.1 as usize
3504}
3505
3506/// Build top-N caller and callee lists for each definition (flattened).
3507fn build_def_neighbor_lists(
3508 n: usize,
3509 edges: &[(u32, u32, u32)],
3510 offsets: &[usize],
3511) -> (Vec<Vec<DefId>>, Vec<Vec<DefId>>) {
3512 let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3513 let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3514
3515 for &(src, dst, w) in edges {
3516 let (s, d) = (src as usize, dst as usize);
3517 if s < n && d < n {
3518 incoming[d].push((src, w));
3519 outgoing[s].push((dst, w));
3520 }
3521 }
3522
3523 // Convert flat index back to DefId
3524 let to_def_id = |flat: u32| -> DefId {
3525 let flat_usize = flat as usize;
3526 let file_idx = offsets.partition_point(|&o| o <= flat_usize) - 1;
3527 let def_idx = flat_usize - offsets[file_idx];
3528 #[expect(clippy::cast_possible_truncation)]
3529 (file_idx as u32, def_idx as u16)
3530 };
3531
3532 let callers = incoming
3533 .into_iter()
3534 .map(|mut v| {
3535 v.sort_by_key(|b| std::cmp::Reverse(b.1));
3536 v.truncate(MAX_NEIGHBORS);
3537 v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
3538 })
3539 .collect();
3540
3541 let callees = outgoing
3542 .into_iter()
3543 .map(|mut v| {
3544 v.sort_by_key(|b| std::cmp::Reverse(b.1));
3545 v.truncate(MAX_NEIGHBORS);
3546 v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
3547 })
3548 .collect();
3549
3550 (callers, callees)
3551}
3552
3553// ── PageRank ─────────────────────────────────────────────────────────
3554
3555/// Compute `PageRank` scores for a graph.
3556///
3557/// If `focus` is `Some(idx)`, computes topic-sensitive `PageRank` biased
3558/// toward file `idx`. Otherwise computes standard (uniform) `PageRank`.
3559///
3560/// Returns one score per node, summing to 1.0.
3561#[expect(
3562 clippy::cast_precision_loss,
3563 reason = "node count fits comfortably in f32"
3564)]
3565fn pagerank(n: usize, edges: &[(u32, u32, u32)], focus: Option<usize>) -> Vec<f32> {
3566 if n == 0 {
3567 return vec![];
3568 }
3569
3570 // Build adjacency: out_edges[src] = [(dst, weight)]
3571 let mut out_edges: Vec<Vec<(usize, f32)>> = vec![vec![]; n];
3572 let mut out_weight: Vec<f32> = vec![0.0; n];
3573
3574 for &(src, dst, w) in edges {
3575 let (s, d) = (src as usize, dst as usize);
3576 if s < n && d < n {
3577 #[expect(clippy::cast_possible_truncation, reason = "edge weights are small")]
3578 let wf = f64::from(w) as f32;
3579 out_edges[s].push((d, wf));
3580 out_weight[s] += wf;
3581 }
3582 }
3583
3584 // Personalization vector (Haveliwala 2002, topic-sensitive PageRank).
3585 //
3586 // When a focus file is specified, the teleportation distribution is split:
3587 // - PERSONALIZATION_ALPHA (0.15) concentrated on the focus node.
3588 // - (1 - PERSONALIZATION_ALPHA) = 0.85 spread uniformly over the
3589 // remaining (n - 1) other nodes.
3590 //
3591 // This gives the focus file a gentle bias over its neighbors without
3592 // collapsing every other file to an equal uniform floor. The resulting
3593 // ranks still vary across the corpus, so the caller sees a *neighborhood*
3594 // of semantically related files rebiased toward the focus (I#16 fix).
3595 //
3596 // For n == 1 there are no other nodes; the focus gets all mass (= 1.0).
3597 let bias: Vec<f32> = if let Some(idx) = focus {
3598 if n == 1 {
3599 vec![1.0_f32]
3600 } else {
3601 let other_mass = (1.0_f32 - PERSONALIZATION_ALPHA) / (n as f32 - 1.0);
3602 let mut b = vec![other_mass; n];
3603 if idx < n {
3604 b[idx] = PERSONALIZATION_ALPHA;
3605 }
3606 // Verify sum ≈ 1.0 (should hold by construction; normalization
3607 // guards against floating-point drift on very large graphs).
3608 let sum: f32 = b.iter().sum();
3609 for v in &mut b {
3610 *v /= sum;
3611 }
3612 b
3613 }
3614 } else {
3615 vec![1.0 / n as f32; n]
3616 };
3617
3618 let mut rank = vec![1.0 / n as f32; n];
3619 let mut next_rank = vec![0.0_f32; n];
3620
3621 for _ in 0..MAX_ITERATIONS {
3622 // Collect dangling mass (nodes with no outgoing edges)
3623 let dangling: f32 = rank
3624 .iter()
3625 .enumerate()
3626 .filter(|&(i, _)| out_edges[i].is_empty())
3627 .map(|(_, &r)| r)
3628 .sum();
3629
3630 // Distribute rank
3631 for (i, nr) in next_rank.iter_mut().enumerate() {
3632 *nr = (1.0 - DAMPING).mul_add(bias[i], DAMPING * dangling * bias[i]);
3633 }
3634
3635 for (src, edges_list) in out_edges.iter().enumerate() {
3636 if edges_list.is_empty() {
3637 continue;
3638 }
3639 let src_rank = rank[src];
3640 let total_w = out_weight[src];
3641 for &(dst, w) in edges_list {
3642 next_rank[dst] += DAMPING * src_rank * (w / total_w);
3643 }
3644 }
3645
3646 // Check convergence
3647 let diff: f32 = rank
3648 .iter()
3649 .zip(next_rank.iter())
3650 .map(|(a, b)| (a - b).abs())
3651 .sum();
3652
3653 std::mem::swap(&mut rank, &mut next_rank);
3654
3655 if diff < EPSILON {
3656 break;
3657 }
3658 }
3659
3660 rank
3661}
3662
3663// ── Graph Building ───────────────────────────────────────────────────
3664
3665/// Intermediate result from definition-level graph computation.
3666struct DefGraphData {
3667 def_edges: Vec<(DefId, DefId, u32)>,
3668 def_ranks: Vec<f32>,
3669 def_callers: Vec<Vec<DefId>>,
3670 def_callees: Vec<Vec<DefId>>,
3671 offsets: Vec<usize>,
3672 base_ranks: Vec<f32>,
3673 file_edges: Vec<(u32, u32, u32)>,
3674}
3675
3676/// Build bidirectional trait↔impl method edges for PageRank propagation (G3).
3677///
3678/// For every impl method that overrides a trait method, adds:
3679/// - `(impl_def_id, trait_def_id, 1)` — impl → trait
3680/// - `(trait_def_id, impl_def_id, 1)` — trait → impl
3681///
3682/// Detection heuristic: an impl method "overrides" a trait method when:
3683/// - The impl method's kind is `"function_item"` and its `scope` starts with
3684/// `"impl_item"`.
3685/// - The trait method's kind is `"function_signature_item"` and its `scope`
3686/// starts with `"trait_item"`.
3687/// - Both have the same `name`.
3688/// - The impl's file imports the trait's file (or they share a file).
3689///
3690/// This is heuristic, not sound: it may produce false positives when two
3691/// unrelated traits define methods with the same name. The practical false-
3692/// positive rate on real Rust codebases is low because method names are
3693/// usually unique within a crate.
3694#[must_use]
3695pub fn build_trait_impl_edges_pub(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
3696 build_trait_impl_edges(files)
3697}
3698
3699fn build_trait_impl_edges(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
3700 // Build index: method_name → list of (DefId, is_trait_method).
3701 // trait method: kind == "function_signature_item" (abstract) OR scope contains "trait_item".
3702 // impl method: kind == "function_item" AND scope contains "impl_item".
3703 let mut trait_methods: HashMap<String, Vec<DefId>> = HashMap::new();
3704 let mut impl_methods: HashMap<String, Vec<DefId>> = HashMap::new();
3705
3706 for (fi, file) in files.iter().enumerate() {
3707 for (di, def) in file.defs.iter().enumerate() {
3708 #[expect(clippy::cast_possible_truncation)]
3709 let did: DefId = (fi as u32, di as u16);
3710 if def.kind == "function_signature_item"
3711 || (def.scope.starts_with("trait_item") && def.kind == "function_item")
3712 {
3713 trait_methods.entry(def.name.clone()).or_default().push(did);
3714 } else if def.kind == "function_item" && def.scope.starts_with("impl_item") {
3715 impl_methods.entry(def.name.clone()).or_default().push(did);
3716 }
3717 }
3718 }
3719
3720 // Pre-build imported-files sets to restrict matching.
3721 let imported_sets: Vec<std::collections::HashSet<u32>> = files
3722 .iter()
3723 .map(|f| {
3724 f.imports
3725 .iter()
3726 .filter_map(|imp| imp.resolved_idx)
3727 .collect()
3728 })
3729 .collect();
3730
3731 let mut edges: Vec<(DefId, DefId, u32)> = Vec::new();
3732
3733 for (name, trait_defs) in &trait_methods {
3734 let Some(impl_defs) = impl_methods.get(name) else {
3735 continue;
3736 };
3737 for &(tf, td) in trait_defs {
3738 for &(imf, imd) in impl_defs {
3739 // The impl file must import the trait file (or be the same file).
3740 let connected = tf == imf
3741 || imported_sets
3742 .get(imf as usize)
3743 .is_some_and(|s| s.contains(&tf));
3744 if connected {
3745 let trait_id: DefId = (tf, td);
3746 let impl_id: DefId = (imf, imd);
3747 edges.push((trait_id, impl_id, 1));
3748 edges.push((impl_id, trait_id, 1));
3749 }
3750 }
3751 }
3752 }
3753
3754 edges
3755}
3756
3757/// Build definition-level edges, compute `PageRank`, and derive file-level data.
3758fn compute_def_graph(files: &[FileNode]) -> DefGraphData {
3759 // Build definition-level edge list from resolved calls
3760 let mut def_edge_map: HashMap<(DefId, DefId), u32> = HashMap::new();
3761 for (file_idx, file) in files.iter().enumerate() {
3762 for (def_idx, def) in file.defs.iter().enumerate() {
3763 #[expect(clippy::cast_possible_truncation)]
3764 let caller_id: DefId = (file_idx as u32, def_idx as u16);
3765 for call in &def.calls {
3766 if let Some(callee_id) = call.resolved {
3767 *def_edge_map.entry((caller_id, callee_id)).or_insert(0) += 1;
3768 }
3769 }
3770 }
3771 }
3772
3773 // Add trait↔impl bidirectional edges (G3).
3774 let trait_impl_edges = build_trait_impl_edges(files);
3775 for (src, dst, w) in trait_impl_edges {
3776 *def_edge_map.entry((src, dst)).or_insert(0) += w;
3777 }
3778
3779 let def_edges: Vec<(DefId, DefId, u32)> = def_edge_map
3780 .into_iter()
3781 .map(|((src, dst), w)| (src, dst, w))
3782 .collect();
3783
3784 // Compute def-level PageRank
3785 let offsets = def_offsets(files);
3786 let n_defs = *offsets.last().unwrap_or(&0);
3787
3788 let flat_def_edges: Vec<(u32, u32, u32)> = def_edges
3789 .iter()
3790 .map(|(src, dst, w)| {
3791 #[expect(clippy::cast_possible_truncation)]
3792 (
3793 flatten_def_id(&offsets, *src) as u32,
3794 flatten_def_id(&offsets, *dst) as u32,
3795 *w,
3796 )
3797 })
3798 .collect();
3799
3800 let def_ranks = pagerank(n_defs, &flat_def_edges, None);
3801
3802 // Derive file-level edges from def-level call edges. A cross-file def→def
3803 // edge contributes one file→file edge in the same direction; intra-file
3804 // edges are skipped (they cannot move rank between files).
3805 let mut file_edge_map: HashMap<(u32, u32), u32> = HashMap::new();
3806 for &(src, dst, w) in &def_edges {
3807 let src_file = src.0;
3808 let dst_file = dst.0;
3809 if src_file != dst_file {
3810 *file_edge_map.entry((src_file, dst_file)).or_insert(0) += w;
3811 }
3812 }
3813 let file_edges: Vec<(u32, u32, u32)> = file_edge_map
3814 .into_iter()
3815 .map(|((src, dst), w)| (src, dst, w))
3816 .collect();
3817
3818 // File-level rank: run PageRank directly on the file-level edge graph
3819 // (B-0025 fix, Cycle 11). The earlier formulation aggregated file rank
3820 // by summing per-def ranks of every def in the file, which amplified
3821 // the teleportation floor for files with high def cardinality but zero
3822 // in-degree (test files in particular). Running PageRank on
3823 // `file_edges` makes file rank obey the user-visible invariant: rank
3824 // flows from caller-file to callee-file, never from callee to caller.
3825 //
3826 // A file with no incoming file→file edges receives only the
3827 // teleportation floor `(1 - DAMPING) / n_files`, regardless of how
3828 // many defs it contains. Test files (which call hubs but are called
3829 // by no one) now correctly drop to the floor.
3830 let n_files = files.len();
3831 let base_ranks: Vec<f32> = if n_files == 0 {
3832 Vec::new()
3833 } else {
3834 pagerank(n_files, &file_edges, None)
3835 };
3836
3837 // Build def-level caller/callee lists
3838 let (def_callers, def_callees) = build_def_neighbor_lists(n_defs, &flat_def_edges, &offsets);
3839
3840 DefGraphData {
3841 def_edges,
3842 def_ranks,
3843 def_callers,
3844 def_callees,
3845 offsets,
3846 base_ranks,
3847 file_edges,
3848 }
3849}
3850
3851/// Build a dependency graph from a repository root.
3852///
3853/// Walks the directory tree, parses each supported file with tree-sitter,
3854/// extracts definitions and imports, resolves import paths to files, runs
3855/// `PageRank`, and builds caller/callee lists.
3856///
3857/// # Errors
3858///
3859/// Returns an error if file walking or reading fails.
3860#[expect(
3861 clippy::too_many_lines,
3862 reason = "three-phase parallel pipeline (walk+filter, def+import extraction, call extraction) \
3863 plus resolve + graph build; phases share state (file_index, raw_sources) and \
3864 cannot be meaningfully split without passing large mutable structures across \
3865 boundaries with no clarity gain"
3866)]
3867pub fn build_graph(root: &Path) -> crate::Result<RepoGraph> {
3868 let root = root.canonicalize().map_err(|e| crate::Error::Io {
3869 path: root.display().to_string(),
3870 source: e,
3871 })?;
3872
3873 let mut walk_options = walk::WalkOptions::default();
3874 if let Some((_, config)) = crate::cache::config::find_config(&root) {
3875 walk_options.ignore_patterns = config.ignore.patterns;
3876 }
3877 let all_files = walk::collect_files_with_options(&root, &walk_options);
3878
3879 // Phase 1: parallel filter + read. For each candidate path with a
3880 // supported extension, read its source from disk and emit a tuple
3881 // alongside its relative path. rayon spreads the I/O cost across
3882 // worker threads; on a 1M-file corpus this was ~20s sequential and
3883 // now sits in the 2-3s range bounded by disk + filter throughput.
3884 let raw_inputs: Vec<(PathBuf, String, String, String)> = all_files
3885 .par_iter()
3886 .filter_map(|path| {
3887 let ext = path
3888 .extension()
3889 .and_then(|e| e.to_str())
3890 .unwrap_or_default()
3891 .to_string();
3892 if languages::config_for_extension(&ext).is_none()
3893 && import_query_for_extension(&ext).is_none()
3894 {
3895 return None;
3896 }
3897 let source = std::fs::read_to_string(path).ok()?;
3898 let rel_path = path
3899 .strip_prefix(&root)
3900 .unwrap_or(path)
3901 .display()
3902 .to_string();
3903 Some((path.clone(), rel_path, ext, source))
3904 })
3905 .collect();
3906
3907 // Build the contiguous `files` Vec and the absolute-path -> idx
3908 // lookup. Sequential because both want stable indices that match
3909 // `raw_sources`'s order; the per-file work this gates is trivial.
3910 let mut file_index: HashMap<PathBuf, usize> = HashMap::with_capacity(raw_inputs.len());
3911 let mut files: Vec<FileNode> = Vec::with_capacity(raw_inputs.len());
3912 let mut raw_sources: Vec<(usize, String, String)> = Vec::with_capacity(raw_inputs.len());
3913 for (idx, (abs_path, rel_path, ext, source)) in raw_inputs.into_iter().enumerate() {
3914 file_index.insert(abs_path, idx);
3915 files.push(FileNode {
3916 path: rel_path,
3917 defs: vec![],
3918 imports: vec![],
3919 });
3920 raw_sources.push((idx, ext, source));
3921 }
3922
3923 // Phase 2: parallel per-file definition + import extraction. Each
3924 // file's tree-sitter parse + def/import queries are independent;
3925 // par_iter_mut over files.iter_mut().zip(raw_sources.par_iter())
3926 // lets every rayon worker grind its own slice. The closures here
3927 // borrow `&root` and `&file_index` immutably (both Sync) and write
3928 // disjoint `FileNode` slots via the &mut iterator.
3929 files
3930 .par_iter_mut()
3931 .zip(raw_sources.par_iter())
3932 .for_each(|(file, (_, ext, source))| {
3933 if let Some(config) = languages::config_for_extension(ext) {
3934 file.defs = extract_definitions(source, &config);
3935 // Go method_declaration scopes are empty after the generic
3936 // extract_definitions pass (the method is a top-level node
3937 // with no structural parent in CONTAINER_KINDS). Enrich them
3938 // with the receiver type so that resolve_calls Priority 2
3939 // (scope.contains(recv_type)) fires correctly for cross-file
3940 // Go receiver-method calls. This populates def_callers[] for
3941 // Go in compute_def_graph (P1 fix).
3942 if languages::is_go_language(&config.language) {
3943 enrich_go_method_def_scopes(source, &mut file.defs);
3944 }
3945 // SQL: prepend a synthetic file-level def named after the
3946 // filename stem (dbt/sqlmesh convention). The whole-file
3947 // byte range becomes the smallest-enclosing fallback for
3948 // FROM/JOIN call-edges that are not inside any CTE, which
3949 // is the resolution target for cross-model references
3950 // (S1, Wave 4). file.path is relative to the repo root and
3951 // is what file_stem() needs to derive the model name.
3952 if languages::is_sql_language(&config.language) {
3953 enrich_sql_file_def(&file.path, source, &mut file.defs);
3954 }
3955 }
3956 if let Some((lang, import_query)) = import_query_for_extension(ext) {
3957 let raw_imports = extract_imports(source, &lang, &import_query);
3958 let file_path = root.join(&file.path);
3959 file.imports = raw_imports
3960 .into_iter()
3961 .map(|raw| {
3962 let resolved_idx =
3963 resolve_import(&raw, ext, &file_path, &root, &file_index)
3964 .and_then(|i| u32::try_from(i).ok());
3965 ImportRef {
3966 raw_path: raw,
3967 resolved_idx,
3968 }
3969 })
3970 .collect();
3971 }
3972 });
3973
3974 // Phase 3: parallel per-file call extraction. Mutates each
3975 // FileNode's `defs[*].calls` independently. Aligned with
3976 // raw_sources by index via the zip.
3977 files
3978 .par_iter_mut()
3979 .zip(raw_sources.par_iter())
3980 .for_each(|(file, (_, ext, source))| {
3981 if let Some(call_config) = languages::call_query_for_extension(ext) {
3982 extract_calls(source, &call_config, &mut file.defs);
3983 }
3984 });
3985
3986 // Build the Python class hierarchy (class_name → parent class names) by
3987 // walking every Python source file. The map is used by `resolve_calls`
3988 // Priority 2.5 to dispatch `self.method()` calls through the MRO when
3989 // the method lives on a parent / mixin class (Q1, Wave 2).
3990 //
3991 // Parallel: extract_python_class_hierarchy is pure per-file, then we
3992 // fold the per-file maps into one global map sequentially because
3993 // HashMap is not lock-free. On a 1k-Python-file corpus this fold takes
3994 // <10ms — much smaller than the parallel parse work that feeds it.
3995 let python_hierarchies: Vec<HashMap<String, Vec<String>>> = raw_sources
3996 .par_iter()
3997 .map(|(_, ext, source)| {
3998 if ext == "py" || ext == "pyi" {
3999 extract_python_class_hierarchy(source)
4000 } else {
4001 HashMap::new()
4002 }
4003 })
4004 .collect();
4005 let mut python_class_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
4006 for local in python_hierarchies {
4007 for (k, v) in local {
4008 // First declaration wins on name collisions across files. The
4009 // MRO walk only needs a plausible parent chain to find an
4010 // ancestor's methods; this is conservative but acceptable.
4011 python_class_hierarchy.entry(k).or_insert(v);
4012 }
4013 }
4014
4015 // Resolve call references to target definitions
4016 let def_index = build_def_index(&files);
4017 resolve_calls(&mut files, &def_index, &python_class_hierarchy);
4018
4019 // Build def-level graph, compute PageRank, and derive file-level data
4020 let graph_data = compute_def_graph(&files);
4021
4022 // Build file-level caller/callee lists
4023 let n = files.len();
4024 let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
4025
4026 // Auto-tune alpha based on graph density
4027 #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
4028 let density = if n > 1 {
4029 graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
4030 } else {
4031 0.0
4032 };
4033 let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
4034
4035 Ok(RepoGraph {
4036 files,
4037 edges: graph_data.file_edges,
4038 base_ranks: graph_data.base_ranks,
4039 callers,
4040 callees,
4041 def_edges: graph_data.def_edges,
4042 def_ranks: graph_data.def_ranks,
4043 def_callers: graph_data.def_callers,
4044 def_callees: graph_data.def_callees,
4045 def_offsets: graph_data.offsets,
4046 alpha,
4047 })
4048}
4049
4050/// Build a `RepoGraph` directly from a pre-constructed `Vec<FileNode>`.
4051///
4052/// Skips the filesystem walk phase of [`build_graph`]; useful for integration
4053/// tests that want to build synthetic graphs without touching disk.
4054///
4055/// Resolves calls, builds the def-level graph (including G3 trait↔impl edges),
4056/// computes `PageRank`, and builds caller/callee lists.
4057#[must_use]
4058pub fn build_graph_from_files_pub(files: Vec<FileNode>) -> RepoGraph {
4059 let empty_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
4060 build_graph_from_files_with_mro_pub(files, &empty_hierarchy)
4061}
4062
4063/// Build a `RepoGraph` from synthetic files with Python MRO resolution enabled.
4064///
4065/// Identical to [`build_graph_from_files_pub`] except that the resolver's
4066/// Priority 2.5 (MRO walk) fires against the caller-supplied
4067/// `python_class_hierarchy` map. Use this when an integration test needs to
4068/// exercise `self.method()` dispatch through inherited classes — including
4069/// the inverse-edge propagation into [`RepoGraph::def_callers`] that
4070/// `lsp_incoming_calls` consumes.
4071///
4072/// I#58 / 4.1.3: when a `SubScreen.method_caller` calls `self.method()` and
4073/// the MRO walk binds the call to `Mixin.method`, the forward edge
4074/// `(SubScreen.method_caller, Mixin.method)` is recorded in
4075/// [`DefGraphData::def_edges`]. [`build_def_neighbor_lists`] then derives
4076/// symmetric inverse-edge entries: `def_callers[Mixin.method]` includes
4077/// `SubScreen.method_caller`. This is the property `lsp_incoming_calls` reads.
4078#[must_use]
4079pub fn build_graph_from_files_with_mro_pub<H>(
4080 mut files: Vec<FileNode>,
4081 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
4082) -> RepoGraph
4083where
4084 H: std::hash::BuildHasher,
4085{
4086 let def_index = build_def_index(&files);
4087 resolve_calls(&mut files, &def_index, python_class_hierarchy);
4088 let graph_data = compute_def_graph(&files);
4089 let n = files.len();
4090 let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
4091
4092 #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
4093 let density = if n > 1 {
4094 graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
4095 } else {
4096 0.0
4097 };
4098 let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
4099
4100 RepoGraph {
4101 files,
4102 edges: graph_data.file_edges,
4103 base_ranks: graph_data.base_ranks,
4104 callers,
4105 callees,
4106 def_edges: graph_data.def_edges,
4107 def_ranks: graph_data.def_ranks,
4108 def_callers: graph_data.def_callers,
4109 def_callees: graph_data.def_callees,
4110 def_offsets: graph_data.offsets,
4111 alpha,
4112 }
4113}
4114
4115// ── Dead-code analysis ───────────────────────────────────────────────────────
4116
/// Global flat definition index: `def_offsets[file_idx] + def_idx_within_file`.
///
/// This is the natural index into [`RepoGraph::def_ranks`],
/// [`RepoGraph::def_callers`], and [`RepoGraph::def_callees`].
///
/// It is a plain alias for `usize`, so it documents intent but gives no
/// type-level protection against mixing it up with a file index or a
/// per-file definition index.
pub type DefIndex = usize;
4122
/// A connected component in the dead-code subgraph.
///
/// All members are definitions that are unreachable from any entry point.
/// As built by [`compute_dead_code`], the component is formed by treating
/// the call edges in [`RepoGraph::def_edges`] as undirected — so a
/// transitively-dead group (including mutual-recursion cycles that are
/// collectively unreachable) surfaces as one cluster.
#[derive(Debug, Clone)]
pub struct DeadCluster {
    /// Global flat def index of the cluster root (the member with the
    /// highest [`RepoGraph::def_ranks`] score).
    pub root_def_idx: usize,
    /// Number of definitions in this cluster; [`compute_dead_code`] sets it
    /// to `member_def_indices.len()`.
    pub size: usize,
    /// Sum of `end_line - start_line` for every member definition. The
    /// per-member difference saturates at zero, and a member whose
    /// definition cannot be looked up contributes zero.
    pub total_lines: usize,
    /// All member global def indices, root first; [`compute_dead_code`]
    /// lists the remaining members in ascending index order.
    pub member_def_indices: Vec<usize>,
}
4141
/// Confidence level for [`DeadCodeReport::dead_fraction`].
///
/// Callers should interpret `dead_fraction` according to this indicator.
/// When entry-point coverage is sparse (common in test-heavy corpora or
/// macro-dispatched frameworks), the raw fraction can be misleading — the
/// cluster list is always more trustworthy than the absolute number.
///
/// ## Rubric
///
/// | Level  | Condition                                                              |
/// |--------|------------------------------------------------------------------------|
/// | High   | entry coverage ≥ 10 % AND ≥ 1 LibraryExport AND ≥ 1 Main               |
/// | Medium | entry coverage ≥ 2 % AND ≥ 1 production entry AND tests not dominant   |
/// | Low    | entry coverage < 2 % OR tests > 80 % of all entries                    |
///
/// "Production entries" = LibraryExport + FrameworkDispatched + Main + Ffi.
///
/// [`compute_confidence`] evaluates the rows top to bottom and the first
/// match wins: a corpus that satisfies the `High` row is reported as `High`
/// even when tests make up more than 80 % of its entries, and `Low` is the
/// fallback for everything that matches neither `High` nor `Medium` (for
/// example, adequate coverage but no production entry at all).
///
/// Serialized with serde as `snake_case` (`"high"`, `"medium"`, `"low"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DeadCodeConfidence {
    /// Entry-point coverage is dense; `dead_fraction` is a quantitative signal.
    High,
    /// `dead_fraction` is directional; trust the cluster list more than the number.
    Medium,
    /// `dead_fraction` may mislead (e.g., tests-dominated corpus); use cluster
    /// `member_defs` + `lsp_references` to verify each candidate before deletion.
    Low,
}
4169
4170/// Compute a [`DeadCodeConfidence`] level from entry-point kind counts.
4171///
4172/// Parameters mirror the counts emitted by
4173/// [`crate::entry_points::summarize_entry_point_kinds`]:
4174///
4175/// - `total_defs` — total number of definitions in the graph.
4176/// - `library_exports` — count of [`crate::entry_points::EntryPointKind::LibraryExport`] entries.
4177/// - `framework_dispatched` — count of [`crate::entry_points::EntryPointKind::FrameworkDispatched`] entries.
4178/// - `main_entries` — count of [`crate::entry_points::EntryPointKind::Main`] entries.
4179/// - `test_entries` — count of [`crate::entry_points::EntryPointKind::Test`] entries.
4180/// - `ffi_entries` — count of [`crate::entry_points::EntryPointKind::Ffi`] entries.
4181///
4182/// ProcMacro / Init / BuildScript entries are not passed separately; they are
4183/// rare enough that their absence does not materially affect the heuristic.
4184#[must_use]
4185#[expect(
4186 clippy::cast_precision_loss,
4187 reason = "entry and def counts fit comfortably in f64 for ratio computation"
4188)]
4189pub fn compute_confidence(
4190 total_defs: usize,
4191 library_exports: usize,
4192 framework_dispatched: usize,
4193 main_entries: usize,
4194 test_entries: usize,
4195 ffi_entries: usize,
4196) -> DeadCodeConfidence {
4197 let total_entries =
4198 library_exports + framework_dispatched + main_entries + test_entries + ffi_entries;
4199 let entry_coverage = if total_defs > 0 {
4200 total_entries as f64 / total_defs as f64
4201 } else {
4202 0.0
4203 };
4204 let test_dominant = total_entries > 0 && (test_entries as f64 / total_entries as f64) > 0.8;
4205 let production_entries = library_exports + framework_dispatched + main_entries + ffi_entries;
4206
4207 if entry_coverage >= 0.10 && library_exports >= 1 && main_entries >= 1 {
4208 DeadCodeConfidence::High
4209 } else if entry_coverage >= 0.02 && production_entries >= 1 && !test_dominant {
4210 DeadCodeConfidence::Medium
4211 } else {
4212 DeadCodeConfidence::Low
4213 }
4214}
4215
/// Summary report from [`compute_dead_code`].
///
/// The primary consumer is X3's `mcp__ripvec__find_dead_code` MCP tool.
#[derive(Debug, Clone)]
pub struct DeadCodeReport {
    /// Dead clusters sorted by size descending (largest first); clusters of
    /// equal size are ordered by descending `root_def_idx`.
    pub dead_clusters: Vec<DeadCluster>,
    /// Total number of definitions unreachable from any entry point.
    pub total_dead_defs: usize,
    /// Total number of definitions reachable from at least one entry point.
    pub total_live_defs: usize,
    /// Fraction of all definitions that are dead: `dead / (dead + live)`.
    /// `0.0` when the graph has no definitions.
    pub dead_fraction: f32,
    /// Confidence that `dead_fraction` is a reliable signal.
    ///
    /// Defaults to [`DeadCodeConfidence::Low`] until the caller supplies
    /// entry-point kind counts via [`compute_confidence`];
    /// [`compute_dead_code`] itself always returns `Low` here.
    pub confidence: DeadCodeConfidence,
}
4235
/// Returns true if the given path is a test-related path.
///
/// The whole check is case-insensitive. A path is test-related when either:
///
/// - it contains `tests/`, contains `/spec/` or `/specs/`, or starts with
///   `spec/` or `specs/` (a top-level spec directory in a relative path); or
/// - its file stem (the file name up to the first `.`) starts with `test_`,
///   ends with `_test`, or contains `bench`.
///
/// Paths are split on `/` only.
fn is_test_path(path: &str) -> bool {
    let path_lc = path.to_lowercase();

    let in_test_dir = path_lc.contains("tests/")
        || path_lc.contains("/spec/")
        || path_lc.contains("/specs/")
        // Relative paths have no leading `/`, so a top-level `spec/` or
        // `specs/` directory must be matched as a prefix.
        || path_lc.starts_with("spec/")
        || path_lc.starts_with("specs/");
    if in_test_dir {
        return true;
    }

    // Use the lowercased path here too, so `Test_foo.py` and `Foo_TEST.go`
    // are treated the same as their lowercase spellings.
    let file_name = path_lc.rsplit('/').next().unwrap_or(&path_lc);
    let stem = file_name.split('.').next().unwrap_or(file_name);
    stem.starts_with("test_") || stem.ends_with("_test") || stem.contains("bench")
}
4249
4250/// Resolve a flat [`DefIndex`] to its file index using the `def_offsets`
4251/// prefix-sum table.
4252fn flat_to_file_idx(offsets: &[usize], flat: DefIndex) -> usize {
4253 offsets.partition_point(|&o| o <= flat).saturating_sub(1)
4254}
4255
/// Union-find: path-compressing find. Returns the root representative of `x`.
///
/// Implemented iteratively in two passes — first walk up to the root, then
/// walk the same path again re-pointing every visited node directly at the
/// root. The result and the final `parent` contents are the same as the
/// recursive formulation, but the stack depth stays constant: `uf_union`
/// links roots without any rank/size balancing, so a parent chain can be as
/// long as the number of elements, and recursing once per link can overflow
/// the stack on large inputs.
///
/// # Panics
///
/// Panics if `x` (or any parent pointer reached from it) is out of bounds
/// for `parent`.
fn uf_find(parent: &mut Vec<usize>, x: usize) -> usize {
    // Pass 1: locate the root (the node that is its own parent).
    let mut root = x;
    while parent[root] != root {
        root = parent[root];
    }

    // Pass 2: compress — point every node on the path straight at the root.
    let mut node = x;
    while parent[node] != root {
        let next = parent[node];
        parent[node] = root;
        node = next;
    }

    root
}
4263
4264/// Union-find: merge the components containing `x` and `y`.
4265fn uf_union(parent: &mut Vec<usize>, x: usize, y: usize) {
4266 let rx = uf_find(parent, x);
4267 let ry = uf_find(parent, y);
4268 if rx != ry {
4269 parent[rx] = ry;
4270 }
4271}
4272
/// Compute the set of definitions unreachable from any entry point.
///
/// Returns dead clusters (connected components in the unreachable subgraph),
/// sorted by size descending, together with dead/live totals. The returned
/// [`DeadCodeReport::confidence`] is always [`DeadCodeConfidence::Low`]; the
/// caller is expected to overwrite it using [`compute_confidence`].
///
/// # Parameters
///
/// - `graph` — the graph to analyse. The number of definitions is taken
///   from `graph.def_ranks.len()`; an empty graph yields an empty report.
/// - `entry_def_indices` — flat [`DefIndex`] values used as BFS seeds.
///   Indices that are out of range are ignored.
/// - `include_test_paths` — when `false`, seeds whose owning file path is
///   classified as test-related by `is_test_path` are dropped before the BFS.
///
/// # Algorithm
///
/// 1. Build the effective seed set, optionally filtering out seeds that live
///    in test paths (see `include_test_paths`).
/// 2. BFS forward from the seeds over the full forward adjacency ->
///    reachable set.
/// 3. Complement `(all_defs - reachable)` = dead set.
/// 4. Connected-components on the dead subgraph via the full forward +
///    reverse adjacency treated as undirected (union-find).
/// 5. For each cluster: pick the highest-rank def as the cluster root;
///    `size` = member count; `total_lines` = sum of `(end_line -
///    start_line)` per member.
/// 6. Sort clusters by size descending (ties: larger `root_def_idx` first).
///
/// # Adjacency representation
///
/// Between steps 1 and 2 the full forward + reverse adjacency is built from
/// [`RepoGraph::def_edges`] (the untruncated edge list) as a Compressed
/// Sparse Row (CSR) pair of (`starts`, `dst`) `Vec<u32>` arrays — bounded
/// O(E) memory rather than the O(n_defs * avg_fanout) of duplicated
/// `Vec<Vec<DefIndex>>` that crashed at kernel scale (I#61). The BFS does
/// NOT use [`RepoGraph::def_callees`] / [`RepoGraph::def_callers`]: those
/// are display-oriented neighbor lists capped at [`MAX_NEIGHBORS`] per node,
/// which dropped hub callees from the live set and produced false-positive
/// dead reports (I#57).
///
/// # Panics
///
/// Panics if an edge in `graph.def_edges` names a file index that is out of
/// bounds for `graph.def_offsets` (the offsets are indexed directly).
#[must_use]
#[expect(
    clippy::too_many_lines,
    reason = "six-step BFS+clustering pipeline; splitting into sub-functions \
              would require passing many interdependent Vec borrows with no \
              clarity gain"
)]
pub fn compute_dead_code<S: std::hash::BuildHasher>(
    graph: &RepoGraph,
    entry_def_indices: &HashSet<DefIndex, S>,
    include_test_paths: bool,
) -> DeadCodeReport {
    let n_defs = graph.def_ranks.len();
    if n_defs == 0 {
        return DeadCodeReport {
            dead_clusters: vec![],
            total_dead_defs: 0,
            total_live_defs: 0,
            dead_fraction: 0.0,
            confidence: DeadCodeConfidence::Low,
        };
    }

    // Step 1: build the effective seed set.
    //
    // When test paths are excluded, each seed is mapped back to its owning
    // file and dropped if that file's path looks test-related. A seed whose
    // file cannot be looked up is checked against the empty path.
    let seeds: Vec<DefIndex> = if include_test_paths {
        entry_def_indices.iter().copied().collect()
    } else {
        entry_def_indices
            .iter()
            .copied()
            .filter(|&flat| {
                let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
                let path = graph
                    .files
                    .get(file_idx)
                    .map(|f| f.path.as_str())
                    .unwrap_or("");
                !is_test_path(path)
            })
            .collect()
    };

    // Build CSR-style forward + reverse adjacency from `graph.def_edges`.
    //
    // We do NOT use `graph.def_callees` / `graph.def_callers` here because
    // those are display-oriented neighbor lists truncated to MAX_NEIGHBORS
    // in `build_def_neighbor_lists`. Reading a truncated forward list during
    // BFS produced false-positive dead reports (I#57): a hub function with
    // more than 5 outgoing call edges silently dropped its low-weight
    // callees from the live set, and every helper transitively reachable
    // only through those dropped edges was marked dead.
    //
    // The original I#57 fix materialized `full_callees` / `full_callers` as
    // two `Vec<Vec<DefIndex>>` tables, which at kernel scale (n_defs ≈
    // 600 K with high-fan-out hubs) cost ~1.9 GB and OOM-crashed the MCP
    // server on every parameter variant (I#61, regression).
    //
    // This implementation uses Compressed Sparse Row (CSR) bucketing:
    //   - `fwd_starts[i..=i+1]` slices `fwd_dst` to the destinations of node i.
    //   - `rev_starts[i..=i+1]` slices `rev_src` to the sources that call i.
    // Storage is bounded O(E) (one u32 per directed edge × 2 directions)
    // plus O(n_defs) for the two start arrays. For a 600 K-def / 5 M-edge
    // kernel graph this is ~80 MB rather than ~1.9 GB.
    //
    // u32 is sufficient: every flat DefIndex value is < n_defs and ripvec
    // already constrains `n_defs <= u32::MAX` upstream (DefId.0 is u32).
    #[expect(
        clippy::cast_possible_truncation,
        reason = "n_defs <= u32::MAX is a ripvec-wide invariant; the per-edge \
                  cast cannot overflow"
    )]
    let (fwd_starts, fwd_dst, rev_starts, rev_src) = {
        // First pass: per-node out-degree (forward) and in-degree (reverse).
        // Edges with an endpoint outside `0..n_defs` are skipped here and in
        // the second pass, so both passes agree on which edges exist.
        let mut fwd_deg: Vec<u32> = vec![0; n_defs];
        let mut rev_deg: Vec<u32> = vec![0; n_defs];
        for &(src, dst, _w) in &graph.def_edges {
            let src_flat = graph.def_offsets[src.0 as usize] + src.1 as usize;
            let dst_flat = graph.def_offsets[dst.0 as usize] + dst.1 as usize;
            if src_flat < n_defs && dst_flat < n_defs {
                fwd_deg[src_flat] += 1;
                rev_deg[dst_flat] += 1;
            }
        }
        // Exclusive prefix sums -> bucket starts of length n_defs + 1.
        let mut fwd_starts: Vec<u32> = Vec::with_capacity(n_defs + 1);
        let mut rev_starts: Vec<u32> = Vec::with_capacity(n_defs + 1);
        let mut acc_f: u32 = 0;
        let mut acc_r: u32 = 0;
        fwd_starts.push(0);
        rev_starts.push(0);
        for i in 0..n_defs {
            acc_f = acc_f.saturating_add(fwd_deg[i]);
            acc_r = acc_r.saturating_add(rev_deg[i]);
            fwd_starts.push(acc_f);
            rev_starts.push(acc_r);
        }
        // Second pass: place each edge using a per-node cursor.
        // Every accepted edge was counted once in each direction, so the
        // forward total also sizes the reverse array.
        let total_edges = acc_f as usize;
        let mut fwd_dst: Vec<u32> = vec![0u32; total_edges];
        let mut rev_src: Vec<u32> = vec![0u32; total_edges];
        // Write cursors: one per node, each starting at the first slot of
        // that node's bucket and advancing as edges are placed.
        let mut fwd_cursor: Vec<u32> = fwd_starts[..n_defs].to_vec();
        let mut rev_cursor: Vec<u32> = rev_starts[..n_defs].to_vec();
        for &(src, dst, _w) in &graph.def_edges {
            let src_flat = graph.def_offsets[src.0 as usize] + src.1 as usize;
            let dst_flat = graph.def_offsets[dst.0 as usize] + dst.1 as usize;
            if src_flat < n_defs && dst_flat < n_defs {
                let f = fwd_cursor[src_flat] as usize;
                fwd_dst[f] = dst_flat as u32;
                fwd_cursor[src_flat] += 1;
                let r = rev_cursor[dst_flat] as usize;
                rev_src[r] = src_flat as u32;
                rev_cursor[dst_flat] += 1;
            }
        }
        (fwd_starts, fwd_dst, rev_starts, rev_src)
    };

    // Helper closures: O(1) bucket lookup into the CSR forward / reverse views.
    let fwd_callees = |flat: DefIndex| -> &[u32] {
        let lo = fwd_starts[flat] as usize;
        let hi = fwd_starts[flat + 1] as usize;
        &fwd_dst[lo..hi]
    };
    let rev_callers = |flat: DefIndex| -> &[u32] {
        let lo = rev_starts[flat] as usize;
        let hi = rev_starts[flat + 1] as usize;
        &rev_src[lo..hi]
    };

    // Step 2: BFS forward over the CSR forward view from seeds -> reachable set.
    let mut reachable: Vec<bool> = vec![false; n_defs];
    let mut queue: std::collections::VecDeque<DefIndex> = std::collections::VecDeque::new();

    // Out-of-range seeds are ignored; duplicates are enqueued only once.
    for seed in &seeds {
        if *seed < n_defs && !reachable[*seed] {
            reachable[*seed] = true;
            queue.push_back(*seed);
        }
    }

    while let Some(flat) = queue.pop_front() {
        for &callee_flat_u32 in fwd_callees(flat) {
            let callee_flat = callee_flat_u32 as usize;
            if callee_flat < n_defs && !reachable[callee_flat] {
                reachable[callee_flat] = true;
                queue.push_back(callee_flat);
            }
        }
    }

    // Step 3: dead set = all_defs - reachable (in ascending flat-index order).
    let dead: Vec<DefIndex> = (0..n_defs).filter(|&i| !reachable[i]).collect();
    let total_dead_defs = dead.len();
    let total_live_defs = n_defs - total_dead_defs;

    if dead.is_empty() {
        return DeadCodeReport {
            dead_clusters: vec![],
            total_dead_defs: 0,
            total_live_defs,
            dead_fraction: 0.0,
            confidence: DeadCodeConfidence::Low,
        };
    }

    // Step 4: connected components on the dead subgraph via union-find.
    // `dead_pos` maps a flat def index to its position in `dead`, which is
    // the element id used inside the union-find `parent` table.
    let dead_set: HashSet<DefIndex> = dead.iter().copied().collect();
    let dead_pos: HashMap<DefIndex, usize> = dead
        .iter()
        .copied()
        .enumerate()
        .map(|(pos, idx)| (idx, pos))
        .collect();
    let m = dead.len();
    let mut parent: Vec<usize> = (0..m).collect();

    // Use the same CSR forward / reverse views built above for the BFS so
    // the dead subgraph's connected components reflect every actual call
    // edge — not the [`MAX_NEIGHBORS`]-truncated display view (I#57). The CSR
    // layout keeps storage O(E) at kernel scale rather than the duplicated
    // O(n_defs * avg_fanout) of the pre-I#61 `Vec<Vec<DefIndex>>` (I#61).
    for &flat in &dead {
        let pos_flat = dead_pos[&flat];
        for &callee_flat_u32 in fwd_callees(flat) {
            let callee_flat = callee_flat_u32 as usize;
            if dead_set.contains(&callee_flat) {
                let pos_callee = dead_pos[&callee_flat];
                uf_union(&mut parent, pos_flat, pos_callee);
            }
        }
        for &caller_flat_u32 in rev_callers(flat) {
            let caller_flat = caller_flat_u32 as usize;
            if dead_set.contains(&caller_flat) {
                let pos_caller = dead_pos[&caller_flat];
                uf_union(&mut parent, pos_flat, pos_caller);
            }
        }
    }

    // Flatten roots (path compression for all), so `parent[pos]` can be read
    // directly as the component root below.
    for i in 0..m {
        uf_find(&mut parent, i);
    }

    // Group members by their component root.
    let mut components: HashMap<usize, Vec<DefIndex>> = HashMap::new();
    for (pos, &flat) in dead.iter().enumerate() {
        let root_pos = parent[pos];
        components.entry(root_pos).or_default().push(flat);
    }

    // Step 5: build clusters - root is the highest-rank member.
    let mut clusters: Vec<DeadCluster> = components
        .into_values()
        .map(|members| {
            // A member without a rank entry is compared as rank 0.0.
            let root_flat = members
                .iter()
                .copied()
                .max_by(|&a, &b| {
                    let ra = graph.def_ranks.get(a).copied().unwrap_or(0.0);
                    let rb = graph.def_ranks.get(b).copied().unwrap_or(0.0);
                    ra.total_cmp(&rb)
                })
                .unwrap_or(members[0]);

            // Line span per member; a member whose definition cannot be
            // looked up contributes zero.
            let total_lines: usize = members
                .iter()
                .copied()
                .map(|flat| {
                    let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
                    let def_idx = flat - graph.def_offsets[file_idx];
                    let def = graph.files.get(file_idx).and_then(|f| f.defs.get(def_idx));
                    def.map(|d| (d.end_line as usize).saturating_sub(d.start_line as usize))
                        .unwrap_or(0)
                })
                .sum();

            // Root first, then the remaining members in ascending index order.
            let mut member_def_indices: Vec<usize> = std::iter::once(root_flat)
                .chain(members.iter().copied().filter(|&m| m != root_flat))
                .collect();
            if member_def_indices.len() > 1 {
                member_def_indices[1..].sort_unstable();
            }

            DeadCluster {
                root_def_idx: root_flat,
                size: member_def_indices.len(),
                total_lines,
                member_def_indices,
            }
        })
        .collect();

    // Step 6: sort by size descending; equal sizes are ordered by descending
    // root index, which makes the order independent of HashMap iteration.
    clusters.sort_by(|a, b| {
        b.size
            .cmp(&a.size)
            .then(b.root_def_idx.cmp(&a.root_def_idx))
    });

    #[expect(
        clippy::cast_precision_loss,
        reason = "def counts fit comfortably in f32"
    )]
    let dead_fraction = if n_defs > 0 {
        total_dead_defs as f32 / n_defs as f32
    } else {
        0.0
    };

    DeadCodeReport {
        dead_clusters: clusters,
        total_dead_defs,
        total_live_defs,
        dead_fraction,
        // Caller must set confidence via compute_confidence() after supplying
        // entry-point kind counts. Defaulting to Low is conservative.
        confidence: DeadCodeConfidence::Low,
    }
}
4583
4584impl RepoGraph {
4585 /// Get the `PageRank` score for a specific definition.
4586 #[must_use]
4587 pub fn def_rank(&self, did: DefId) -> f32 {
4588 let flat = self.def_offsets[did.0 as usize] + did.1 as usize;
4589 self.def_ranks.get(flat).copied().unwrap_or(0.0)
4590 }
4591
4592 /// Look up a definition by file path and name. Returns the first match.
4593 #[must_use]
4594 pub fn find_def(&self, file_path: &str, def_name: &str) -> Option<DefId> {
4595 for (file_idx, file) in self.files.iter().enumerate() {
4596 if file.path == file_path {
4597 for (def_idx, def) in file.defs.iter().enumerate() {
4598 if def.name == def_name {
4599 #[expect(clippy::cast_possible_truncation)]
4600 return Some((file_idx as u32, def_idx as u16));
4601 }
4602 }
4603 }
4604 }
4605 None
4606 }
4607
4608 /// Resolve a caller-supplied `focus_file` string to a file index in [`Self::files`].
4609 ///
4610 /// Accepts any of the path forms that ripvec itself emits or accepts:
4611 ///
4612 /// - **Exact stored path** (`device_opt/services/storage.py`) — direct match.
4613 /// - **LSP-shaped path** (`./device_opt/services/storage.py`) — the `./`
4614 /// prefix used by every [`RepoMapLspLocation::file_path`] is stripped
4615 /// before comparison so the documented chaining pattern
4616 /// `get_repo_map(focus_file=hits[0].lsp_location.file_path)` works.
4617 /// - **Strict suffix** (`storage.py`, `services/storage.py`) — match when
4618 /// the previous character in the stored path is `/`. Avoids matching
4619 /// `foo_storage.py` for `storage.py`.
4620 ///
4621 /// Returns [`FocusResolution::Found`] when exactly one file matches,
4622 /// [`FocusResolution::Ambiguous`] when multiple files match (the caller
4623 /// surfaces the candidate list to the user), and [`FocusResolution::NotFound`]
4624 /// when no file matches.
4625 ///
4626 /// # Background
4627 ///
4628 /// Prior to this helper the MCP layer (`crates/ripvec-mcp/src/tools.rs`)
4629 /// did the matching inline with two bugs:
4630 ///
4631 /// 1. **`./` prefix mismatch.** [`RepoMapLspLocation::file_path`] always
4632 /// carries a leading `./` (see [`file_lsp_location`]), but
4633 /// [`FileNode::path`] does not. Passing the LSP location verbatim as
4634 /// `focus_file` matched zero files. The matcher silently returned
4635 /// `focus = None`, producing rank values bit-identical to the unfocused
4636 /// call — the bug originally reported as "I#20 focus_file rebias
4637 /// invisible on Python".
4638 /// 2. **Equal-length false negative.** When the user passed
4639 /// `./device_opt/services/storage.py` and the stored path was
4640 /// `device_opt/services/storage.py`, `exact` was false (the strings
4641 /// differ by two bytes) and `strict_suffix` was false (the focus is
4642 /// longer than the stored path, so `p.len() > focus.len()` fails). The
4643 /// pathology surfaced specifically when the focus was a *full* path
4644 /// with the LSP `./` prefix.
4645 ///
4646 /// Centralising the resolution here gives every caller the same
4647 /// normalization-tolerant semantics and one place to test the contract.
4648 #[must_use]
4649 pub fn resolve_focus_file(&self, focus: &str) -> FocusResolution {
4650 let normalized = normalize_focus_path(focus);
4651 let matches: Vec<usize> = self
4652 .files
4653 .iter()
4654 .enumerate()
4655 .filter_map(|(idx, f)| {
4656 if focus_matches_path(&f.path, normalized) {
4657 Some(idx)
4658 } else {
4659 None
4660 }
4661 })
4662 .collect();
4663 match matches.len() {
4664 0 => FocusResolution::NotFound,
4665 1 => FocusResolution::Found(matches[0]),
4666 _ => FocusResolution::Ambiguous(
4667 matches
4668 .into_iter()
4669 .map(|i| self.files[i].path.clone())
4670 .collect(),
4671 ),
4672 }
4673 }
4674}
4675
/// Result of resolving a user-supplied `focus_file` string against a [`RepoGraph`].
///
/// See [`RepoGraph::resolve_focus_file`] for the resolution semantics and the
/// historical bug that motivated the helper.
#[derive(Debug, Clone)]
pub enum FocusResolution {
    /// Exactly one file matched. Carries the file index in [`RepoGraph::files`].
    Found(usize),
    /// No file matched. The caller treats this as an unfocused call.
    NotFound,
    /// Two or more files matched. The caller surfaces the candidate list so
    /// the user can disambiguate by passing a longer suffix or the full path.
    /// Carries the stored paths of every matching file.
    Ambiguous(Vec<String>),
}
4690
/// Drop a single leading `./` from a `focus_file` path, if present.
///
/// [`file_lsp_location`] emits the `./` form for every relative
/// [`RepoMapLspLocation::file_path`]; removing it yields a value shaped like
/// the stored [`FileNode::path`] entries (which carry no such prefix), ready
/// for the suffix matcher.
///
/// Only one `./` is removed (`././x` becomes `./x`). Any other input —
/// including absolute paths such as `/abs/path/file.py` — is returned
/// unchanged; absolute paths will then fail the suffix match against the
/// relative stored paths, which is the correct behavior (the caller meant a
/// different root entirely).
fn normalize_focus_path(focus: &str) -> &str {
    let Some(relative) = focus.strip_prefix("./") else {
        return focus;
    };
    relative
}
4704
/// Decide whether `focus` identifies `stored_path`.
///
/// A match is either the exact same string, or a strict suffix: `focus` is
/// the tail of `stored_path` and the character immediately before that tail
/// is `/`, so `storage.py` matches `services/storage.py` but not
/// `services/foo_storage.py`. An empty `focus` never matches.
fn focus_matches_path(stored_path: &str, focus: &str) -> bool {
    if focus.is_empty() {
        return false;
    }
    // What remains in front of the suffix must end on a path separator
    // (which also rules out an empty remainder).
    let is_strict_suffix = stored_path
        .strip_suffix(focus)
        .is_some_and(|leading| leading.ends_with('/'));
    is_strict_suffix || stored_path == focus
}
4718
4719/// Build top-N caller and callee lists for each file.
4720///
4721/// Given a list of weighted directed edges `(src, dst, weight)` over `n`
4722/// nodes, returns `(callers[i], callees[i])` for each node `i`, where each
4723/// list contains the top-[`MAX_NEIGHBORS`] adjacent nodes sorted by descending
4724/// edge weight.
4725///
4726/// Exposed as `pub` so that integration tests can construct synthetic
4727/// [`RepoGraph`] instances for unit-testing the JSON rendering without going
4728/// through a full disk walk.
4729#[must_use]
4730pub fn build_neighbor_lists(n: usize, edges: &[(u32, u32, u32)]) -> (Vec<Vec<u32>>, Vec<Vec<u32>>) {
4731 let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
4732 let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
4733
4734 for &(src, dst, w) in edges {
4735 let (s, d) = (src as usize, dst as usize);
4736 if s < n && d < n {
4737 incoming[d].push((src, w));
4738 outgoing[s].push((dst, w));
4739 }
4740 }
4741
4742 // Sort by weight descending, keep top N
4743 let trim = |lists: &mut [Vec<(u32, u32)>]| -> Vec<Vec<u32>> {
4744 lists
4745 .iter_mut()
4746 .map(|list| {
4747 list.sort_by_key(|b| std::cmp::Reverse(b.1));
4748 list.iter()
4749 .take(MAX_NEIGHBORS)
4750 .map(|(idx, _)| *idx)
4751 .collect()
4752 })
4753 .collect()
4754 };
4755
4756 (trim(&mut incoming), trim(&mut outgoing))
4757}
4758
4759// ── Rendering ────────────────────────────────────────────────────────
4760
4761/// Render a budget-constrained overview of the repository.
4762///
4763/// Files are sorted by `PageRank` (or topic-sensitive rank if `focus` is
4764/// `Some`). Output uses four tiers of decreasing detail:
4765///
4766/// - **Tier 0** (top 10%): full path, rank, callers/callees, signatures with scopes
4767/// - **Tier 1** (next 20%): full path, rank, signatures
4768/// - **Tier 2** (next 40%): full path, rank, definition names and kinds
4769/// - **Tier 3** (bottom 30%): file path only
4770///
4771/// Stops accumulating output when the estimated token count exceeds
4772/// `max_tokens`.
4773#[must_use]
4774pub fn render(graph: &RepoGraph, max_tokens: usize, focus: Option<usize>) -> String {
4775 let n = graph.files.len();
4776 if n == 0 {
4777 return String::new();
4778 }
4779
4780 // Compute ranks (recompute topic-sensitive if focus is given)
4781 let ranks = if focus.is_some() {
4782 pagerank(n, &graph.edges, focus)
4783 } else {
4784 graph.base_ranks.clone()
4785 };
4786
4787 // Sort file indices by rank descending
4788 let mut sorted: Vec<usize> = (0..n).collect();
4789 sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
4790
4791 let mut output = String::new();
4792 let mut used_tokens = 0;
4793 let max_chars = max_tokens * CHARS_PER_TOKEN;
4794
4795 for (rank_pos, &file_idx) in sorted.iter().enumerate() {
4796 if used_tokens >= max_tokens {
4797 break;
4798 }
4799
4800 let file = &graph.files[file_idx];
4801 let score = ranks[file_idx];
4802 #[expect(clippy::cast_precision_loss, reason = "file counts fit in f32")]
4803 let percentile = (rank_pos as f32) / (n as f32);
4804
4805 let section = if percentile < 0.1 {
4806 render_tier0(graph, file_idx, file, score)
4807 } else if percentile < 0.3 {
4808 render_tier1(file, score)
4809 } else if percentile < 0.7 {
4810 render_tier2(file, score)
4811 } else {
4812 render_tier3(file)
4813 };
4814
4815 let section_chars = section.len();
4816 if used_tokens > 0 && used_tokens + section_chars / CHARS_PER_TOKEN > max_tokens {
4817 // Would exceed budget — try to fit at least the path
4818 let path_line = format!("{}\n", file.path);
4819 let path_tokens = path_line.len() / CHARS_PER_TOKEN;
4820 if used_tokens + path_tokens <= max_tokens {
4821 output.push_str(&path_line);
4822 }
4823 break;
4824 }
4825
4826 output.push_str(§ion);
4827 used_tokens = output.len().min(max_chars) / CHARS_PER_TOKEN;
4828 }
4829
4830 output
4831}
4832
4833/// Render tier 0: full detail with callers, callees, and signatures.
4834fn render_tier0(graph: &RepoGraph, file_idx: usize, file: &FileNode, score: f32) -> String {
4835 let mut out = format!("## {} (rank: {score:.4})\n", file.path);
4836
4837 // Callers
4838 if file_idx < graph.callers.len() && !graph.callers[file_idx].is_empty() {
4839 let _ = write!(out, " called by: ");
4840 let names: Vec<&str> = graph.callers[file_idx]
4841 .iter()
4842 .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
4843 .collect();
4844 let _ = writeln!(out, "{}", names.join(", "));
4845 }
4846
4847 // Callees
4848 if file_idx < graph.callees.len() && !graph.callees[file_idx].is_empty() {
4849 let _ = write!(out, " calls: ");
4850 let names: Vec<&str> = graph.callees[file_idx]
4851 .iter()
4852 .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
4853 .collect();
4854 let _ = writeln!(out, "{}", names.join(", "));
4855 }
4856
4857 // Definitions with scope and signature
4858 for def in &file.defs {
4859 let scope_prefix = if def.scope.is_empty() {
4860 String::new()
4861 } else {
4862 format!("{} > ", def.scope)
4863 };
4864 if let Some(sig) = &def.signature {
4865 let _ = writeln!(out, " {scope_prefix}{} {sig}", def.kind);
4866 } else {
4867 let _ = writeln!(out, " {scope_prefix}{} {}", def.kind, def.name);
4868 }
4869 }
4870 let _ = writeln!(out);
4871 out
4872}
4873
4874/// Render tier 1: file path, rank, and signatures.
4875fn render_tier1(file: &FileNode, score: f32) -> String {
4876 let mut out = format!("## {} (rank: {score:.4})\n", file.path);
4877 for def in &file.defs {
4878 if let Some(sig) = &def.signature {
4879 let _ = writeln!(out, " {sig}");
4880 } else {
4881 let _ = writeln!(out, " {} {}", def.kind, def.name);
4882 }
4883 }
4884 let _ = writeln!(out);
4885 out
4886}
4887
4888/// Render tier 2: file path, rank, and definition names/kinds.
4889fn render_tier2(file: &FileNode, score: f32) -> String {
4890 let mut out = format!("{} (rank: {score:.4})", file.path);
4891 if !file.defs.is_empty() {
4892 let names: Vec<String> = file
4893 .defs
4894 .iter()
4895 .map(|d| format!("{}:{}", d.kind, d.name))
4896 .collect();
4897 let _ = write!(out, " -- {}", names.join(", "));
4898 }
4899 let _ = writeln!(out);
4900 out
4901}
4902
4903/// Render tier 3: file path only.
4904fn render_tier3(file: &FileNode) -> String {
4905 format!("{}\n", file.path)
4906}
4907
4908// ── JSON rendering ───────────────────────────────────────────────────
4909
4910/// Build the `lsp_location` for a file itself (line 0).
4911fn file_lsp_location(path: &str) -> RepoMapLspLocation {
4912 RepoMapLspLocation {
4913 file_path: if path.starts_with("./") || path.starts_with('/') {
4914 path.to_string()
4915 } else {
4916 format!("./{path}")
4917 },
4918 start_line: 0,
4919 start_character: 0,
4920 end_line: 0,
4921 end_character: 0,
4922 }
4923}
4924
4925/// Infer `ContentKind` from a file path's extension.
4926fn content_kind_for_path(path: &str) -> ContentKind {
4927 let ext = std::path::Path::new(path)
4928 .extension()
4929 .and_then(|e| e.to_str())
4930 .unwrap_or("");
4931 ContentKind::from_extension(ext)
4932}
4933
/// Byte cost charged up front for every file placed in the response.
///
/// A `RepoMapFile` with no symbols and no calls still serialises its path,
/// rank, content kind, empty arrays, and truncation counters. Measured against
/// serde_json output for such an empty entry:
/// `{"lsp_location":{"file_path":"./src/file_N.rs","start_line":0,"start_character":0,`
/// `"end_line":0,"end_character":0},"rank":0.1234,"content_kind":"code",`
/// `"calls":[],"symbols":[],"truncated_symbols":0,"truncated_calls":0}` ≈ 250 bytes.
///
/// Charging this amount first keeps the allocator from handing a file less
/// space than its bare envelope needs.
const FILE_ENVELOPE_MIN_BYTES: usize = 250;

/// Smallest per-file budget worth spending: the envelope plus room for roughly
/// 2-3 typical symbols.
///
/// The number of admitted files is limited so that each one can receive at
/// least this much (Fix A, 4.0.2). Without the limit, low-rank tail files would
/// spend budget on envelopes carrying no symbols or calls, crowding out content
/// for the top files.
const FILE_MIN_USEFUL_BYTES: usize = 600;

/// Lower bound on the share of a file's post-envelope budget set aside for
/// outgoing-call entries; symbols get what remains.
///
/// Bytes the symbols do not use flow into calls, and bytes the calls do not use
/// flow to the next file. (Fix C, 4.0.2 — with no reserve, the symbol loop
/// saturates the per-file budget and calls always come up empty.)
///
/// This is a FLOOR: when the exact byte cost of the stored callees exceeds
/// `post_envelope * CALLS_BUDGET_FRACTION`, the reserve is raised to that exact
/// cost, up to the `CALLS_MAX_FRACTION` ceiling. See B-0059 / 4.1.10.
const CALLS_BUDGET_FRACTION: f64 = 0.30;

/// Upper bound on the share of `post_envelope` that may be reserved for calls.
///
/// Keeps a file with many callees and long callee paths from starving its
/// symbols entirely. Together with the `CALLS_BUDGET_FRACTION` floor this forms
/// the precise-reserve strategy (B-0059 / 4.1.10): the reserve is
/// `clamp(exact_calls_bytes, fraction_floor, fraction_ceil)`.
const CALLS_MAX_FRACTION: f64 = 0.70;

/// Largest share of the total budget any single file may be allocated.
///
/// Without this cap one very-high-rank file (e.g. `lib.rs`) could take the
/// whole budget and leave every other file empty.
const MAX_FILE_SHARE: f64 = 0.40;
4977
/// Structural priority of an AST node kind; larger values surface earlier.
///
/// Serves as the ordering signal when def-level PageRank is degenerate (most
/// ranks near zero), so symbols are ordered by structure rather than noise.
///
/// The ordering mirrors top-down reading: a file's *shape* (traits, structs,
/// enums, type aliases, modules) comes before its *behaviour* (functions,
/// impls), which comes before plain declarations (consts, statics). Every
/// other kind — fields, variables, parameters — gets 0. (Fix B, 4.0.2.)
fn ast_kind_priority(kind: &str) -> u32 {
    // Each row: the node-kind spellings that share one priority value.
    const PRIORITY_TABLE: &[(&[&str], u32)] = &[
        // Tier 3: shape — what THIS file is
        (&["trait_item", "interface", "trait"], 30),
        (&["struct_item", "struct", "class_definition", "class"], 29),
        (&["enum_item", "enum"], 28),
        (&["type_item", "type_alias_declaration", "type_alias"], 27),
        (&["mod_item", "module", "namespace"], 26),
        // Tier 2: behavior — what THIS file does
        (
            &[
                "function_item",
                "function_definition",
                "function",
                "method_definition",
            ],
            20,
        ),
        (&["impl_item", "impl"], 19),
        // Tier 1: declarations
        (&["const_item", "const_declaration", "const"], 10),
        (&["static_item", "static"], 9),
    ];

    // Tier 0: anything not listed is an internal detail.
    PRIORITY_TABLE
        .iter()
        .find(|(kinds, _)| kinds.contains(&kind))
        .map_or(0, |&(_, priority)| priority)
}
5004
5005/// Effective AST priority with corpus-relative rank promotion (4.0.4).
5006///
5007/// Preserves the 4.0.2 AST-priority ordering by default (types first,
5008/// then functions, then fields). When a def's PageRank significantly
5009/// exceeds the corpus median, promotes it up one or two tiers so that
5010/// load-bearing defs surface alongside their declared-tier neighbors.
5011///
5012/// Thresholds are corpus-median multiples (self-calibrating):
5013/// - rank > 4× median → +1 tier (e.g., hot function joins type tier)
5014/// - rank > 16× median → +2 tiers (extremely hot def)
5015/// - otherwise → declared tier preserved
5016///
5017/// On degenerate (flat) rank distributions the median equals the floor,
5018/// nothing crosses threshold, and 4.0.2 AST-priority ordering is fully
5019/// preserved. On informative distributions (post-4.0.3 enrichment),
5020/// hot defs surface proportionally.
5021fn effective_priority(kind: &str, def_rank: f32, promo_1: f32, promo_2: f32) -> u32 {
5022 let base = ast_kind_priority(kind);
5023 // Accumulate promotion tiers as a plain integer to satisfy clippy's
5024 // bool_to_int_with_if lint while preserving branch clarity.
5025 let promo_tiers: u32 = u32::from(def_rank > promo_1) + u32::from(def_rank > promo_2);
5026 // Tier spacing matches ast_kind_priority's 10-unit gaps.
5027 base + promo_tiers * 10
5028}
5029
/// Approximate the serialised JSON size of one `RepoMapSymbol`.
///
/// Calibrated against serde_json output, where a symbol looks roughly like:
/// `{"name":"<N>","kind":<K>,"lsp_location":{"file_path":"<P>","start_line":0,`
/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
///
/// The estimate is a fixed 165 bytes of overhead (braces, keys, fixed-width
/// integers, rank) plus the UTF-8 byte length of `name`. The file path is
/// deliberately left out: it is identical for every symbol of a file and is
/// treated as part of the per-file envelope cost.
fn estimate_symbol_bytes(name: &str) -> usize {
    const SYMBOL_OVERHEAD_BYTES: usize = 165;
    name.len() + SYMBOL_OVERHEAD_BYTES
}
5046
/// Approximate the serialised JSON size of one `RepoMapCall`.
///
/// A call entry looks like `{"lsp_location":{"file_path":"<P>","start_line":0,`
/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`, which is
/// estimated as 120 bytes of fixed overhead plus the UTF-8 byte length of the
/// target path.
fn estimate_call_bytes(target_path: &str) -> usize {
    const CALL_OVERHEAD_BYTES: usize = 120;
    target_path.len() + CALL_OVERHEAD_BYTES
}
5055
5056/// Render a `PageRank`-weighted JSON map with token-budget allocation (4.0.1).
5057///
5058/// # Algorithm
5059///
5060/// **Step 1 — File-share allocation.** Each eligible file receives a byte
5061/// budget proportional to its `base_rank`. The share is capped at 40% of
5062/// `budget_total_bytes` and floored at [`FILE_ENVELOPE_MIN_BYTES`] (200 B).
5063/// Files are included in rank order until the cumulative allocation would
5064/// exceed the total budget.
5065///
5066/// **Step 2 — Per-file symbol fill.** For each included file, symbols are
5067/// walked in def-rank descending order. Inclusion continues until either (a)
5068/// the file's budget share is exhausted (with carry-over of leftover bytes to
5069/// the next file) or (b) a logarithmic attenuation cutoff fires: symbol at
5070/// position `i` (0-based) is included only if its rank ≥ `top_rank /
5071/// (1 + ln(i + 1))`. `calls[]` is filled in target-file base-rank order
5072/// subject to a hard [`MAX_FILE_CALLS`] render cap (no attenuation — I#68,
5073/// 4.1.4) and per-file byte budget. `truncated_symbols` and `truncated_calls`
5074/// track the count of omitted entries.
5075///
5076/// **Step 3 — Response telemetry.** The response includes `estimated_bytes`
5077/// (actual returned content size), `budget_bytes` (`token_budget * 4`),
5078/// and `budget_exhausted` (`total_files > files.len()`).
5079///
5080/// # Arguments
5081///
5082/// - `graph` — the built dependency graph.
5083/// - `token_budget` — caller-specified token budget (× 4 = byte budget).
5084/// - `focus` — optional file index for topic-sensitive `PageRank`.
5085/// - `include_metadata` — when `false` (default), Meta-classified files
5086/// are excluded before ranking.
5087#[must_use]
5088#[expect(
5089 clippy::cast_precision_loss,
5090 reason = "rank sums and counts are small f32/f64; precision loss is acceptable"
5091)]
5092#[expect(
5093 clippy::too_many_lines,
5094 reason = "the three-step allocation algorithm (file-share → symbol-fill → calls-fill) \
5095 is sequential and share state; splitting into helpers would require passing \
5096 mutable slices across three boundaries with no clarity gain"
5097)]
5098pub fn render_json_budgeted(
5099 graph: &RepoGraph,
5100 token_budget: usize,
5101 focus: Option<usize>,
5102 include_metadata: bool,
5103) -> GetRepoMapResponse {
5104 let n = graph.files.len();
5105 if n == 0 {
5106 let budget_bytes = token_budget * CHARS_PER_TOKEN;
5107 return GetRepoMapResponse {
5108 files: vec![],
5109 total_files: 0,
5110 estimated_bytes: 0,
5111 budget_bytes,
5112 budget_exhausted: false,
5113 capped: false,
5114 };
5115 }
5116
5117 let budget_total_bytes = token_budget * CHARS_PER_TOKEN;
5118
5119 // Recompute topic-sensitive ranks if focus is given.
5120 let ranks = if focus.is_some() {
5121 pagerank(n, &graph.edges, focus)
5122 } else {
5123 graph.base_ranks.clone()
5124 };
5125
5126 // Sort all file indices by rank descending.
5127 let mut sorted: Vec<usize> = (0..n).collect();
5128 sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
5129
5130 // Apply metadata exclusion filter.
5131 let eligible: Vec<usize> = if include_metadata {
5132 sorted
5133 } else {
5134 sorted
5135 .into_iter()
5136 .filter(|&idx| {
5137 let kind = content_kind_for_path(&graph.files[idx].path);
5138 kind != ContentKind::Meta
5139 })
5140 .collect()
5141 };
5142
5143 let total_files = eligible.len();
5144
5145 // ── Corpus-median def-rank thresholds for tier promotion (4.0.4) ────────
5146 //
5147 // Compute once per call (corpus-wide, not per-file) so the threshold is
5148 // self-calibrating: flat distributions (all ranks equal) set median = floor
5149 // and nothing crosses threshold; informative distributions see proportional
5150 // promotion. Using corpus-wide median ensures a hot function in one file is
5151 // judged against the entire corpus, not just its local file peers.
5152 // Use the 75th percentile of nonzero def-ranks as the corpus reference value
5153 // for tier promotion (rather than the 50th percentile / median). The 75th
5154 // percentile is more robust: on a flat distribution most defs cluster near the
5155 // floor, so the 75th percentile is only marginally above the floor (making the
5156 // 4× threshold very selective). On an informative distribution (post-4.0.3
5157 // call-edge enrichment) the 75th percentile is meaningfully above the floor,
5158 // so the same 4× multiplier captures genuinely hot defs without falsely
5159 // promoting slightly-above-floor helpers.
5160 //
5161 // The 50th percentile (lower median) was rejected because on a 10-def corpus
5162 // with max/min ratio 5× the median equals the floor, causing the 4× threshold
5163 // to fire on defs that are only 5× above floor (a low-variance corpus). The
5164 // 75th percentile corrects this without requiring hand-tuned per-corpus magic
5165 // numbers.
5166 let corpus_reference_rank: f32 = {
5167 let mut nonzero: Vec<f32> = graph
5168 .def_ranks
5169 .iter()
5170 .copied()
5171 .filter(|r| *r > 0.0)
5172 .collect();
5173 if nonzero.is_empty() {
5174 0.0
5175 } else {
5176 nonzero.sort_unstable_by(f32::total_cmp);
5177 let n = nonzero.len();
5178 // 75th percentile index: floor(0.75 * (n - 1))
5179 let idx = (3 * (n - 1)) / 4;
5180 nonzero[idx]
5181 }
5182 };
5183 let promo_1_threshold = corpus_reference_rank * 4.0; // +1 tier
5184 let promo_2_threshold = corpus_reference_rank * 16.0; // +2 tiers
5185
5186 // ── Step 1: File-share allocation ────────────────────────────────
5187
5188 // Greedily determine which files fit within the budget, computing each
5189 // file's share as it is added. We must run a two-pass approach:
5190 // pass A: determine which files are included (cumulative sum check),
5191 // pass B: fill symbols/calls using final per-file allocations.
5192 //
5193 // The "included" decision is based on the running cumulative sum so that
5194 // the leftover redistribution in step 2 can carry forward correctly.
5195
5196 // Floor-first admission (Fix A, 4.0.2):
5197 //
5198 // Cap admitted file count so each gets at least FILE_MIN_USEFUL_BYTES.
5199 // Below this threshold the response would carry envelopes that contain
5200 // no symbols or calls — pure overhead, no information. Concentrating
5201 // the budget on fewer files with real content is strictly better for
5202 // orientation than dropping envelope sentinels for many files.
5203 let max_admissible = budget_total_bytes / FILE_MIN_USEFUL_BYTES;
5204 let admit_count = eligible.len().min(max_admissible.max(1));
5205
5206 let budget_f64 = budget_total_bytes as f64;
5207
5208 // Pre-compute rank sum across ADMITTED files only (top-N by rank). f64
5209 // to avoid precision loss when summing many small f32 values.
5210 let admitted_rank_sum: f64 = eligible
5211 .iter()
5212 .take(admit_count)
5213 .map(|&idx| f64::from(ranks[idx]))
5214 .sum();
5215 let admitted_rank_sum = if admitted_rank_sum > 0.0 {
5216 admitted_rank_sum
5217 } else {
5218 1.0
5219 };
5220
5221 // Compute per-file budgets. Each admitted file gets at least
5222 // FILE_MIN_USEFUL_BYTES; the proportional-to-rank share is applied on
5223 // top of the floor and capped at MAX_FILE_SHARE.
5224 let mut included_indices: Vec<usize> = Vec::new(); // indices into `eligible`
5225 let mut file_budgets: Vec<usize> = Vec::new();
5226 let mut cumulative_budget: usize = 0;
5227
5228 for (i, &file_idx) in eligible.iter().take(admit_count).enumerate() {
5229 let file_rank = f64::from(ranks[file_idx]);
5230 let raw_share = budget_f64 * file_rank / admitted_rank_sum;
5231 let capped = raw_share.min(budget_f64 * MAX_FILE_SHARE);
5232 // `capped` is non-negative and bounded by budget_f64 (a usize).
5233 #[expect(
5234 clippy::cast_possible_truncation,
5235 clippy::cast_sign_loss,
5236 reason = "capped is non-negative and bounded by budget_total_bytes (a usize)"
5237 )]
5238 let budget_i = (capped as usize).max(FILE_MIN_USEFUL_BYTES);
5239
5240 if cumulative_budget + budget_i > budget_total_bytes && !included_indices.is_empty() {
5241 break;
5242 }
5243 cumulative_budget += budget_i;
5244 included_indices.push(i);
5245 file_budgets.push(budget_i);
5246 }
5247
5248 // ── Step 2: Per-file symbol fill ─────────────────────────────────
5249
5250 let mut result_files: Vec<RepoMapFile> = Vec::with_capacity(included_indices.len());
5251 let mut leftover: usize = 0; // unused bytes carried from previous file
5252
5253 for (slot, &eligible_i) in included_indices.iter().enumerate() {
5254 let file_idx = eligible[eligible_i];
5255 let file = &graph.files[file_idx];
5256 let file_rank = ranks[file_idx];
5257 let file_path_lsp = file_lsp_location(&file.path);
5258
5259 let budget_in = file_budgets[slot] + leftover;
5260
5261 // Pre-compute callee indices here (before the budget split) so the
5262 // precise byte cost of rendering all stored callees is available when
5263 // setting calls_reserve. This is the B-0059 fix (4.1.10): the old code
5264 // placed callee_indices after the symbol loop, so calls_reserve was
5265 // always the fraction-floor — even when the stored callees needed more.
5266 let callee_indices: Vec<usize> = if file_idx < graph.callees.len() {
5267 let mut callees: Vec<(usize, f32)> = graph.callees[file_idx]
5268 .iter()
5269 .filter_map(|&ci| {
5270 let ci = ci as usize;
5271 graph.files.get(ci).map(|_| {
5272 let r = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
5273 (ci, r)
5274 })
5275 })
5276 .collect();
5277 callees.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
5278 callees.into_iter().map(|(ci, _)| ci).collect()
5279 } else {
5280 vec![]
5281 };
5282
5283 // Reserve a fraction of the post-envelope budget for outgoing calls
5284 // (Fix C, 4.0.2). Without this guard the symbol loop saturates
5285 // `budget_in` and the calls loop always trips its byte-check.
5286 // Symbol leftover flows into calls; call leftover flows to the
5287 // next file via the outer `leftover` variable.
5288 //
5289 // B-0059 (4.1.10): raise the reserve to the exact byte cost of
5290 // rendering all stored callees (up to MAX_FILE_CALLS), clamped to
5291 // [fraction_floor, CALLS_MAX_FRACTION * post_envelope]. This ensures
5292 // that at any budget where calls physically fit, they are not crowded
5293 // out by symbols consuming the fraction-floor reserve.
5294 let post_envelope = budget_in.saturating_sub(FILE_ENVELOPE_MIN_BYTES);
5295 let precise_calls_bytes: usize = callee_indices
5296 .iter()
5297 .take(MAX_FILE_CALLS)
5298 .map(|&ci| estimate_call_bytes(&graph.files[ci].path))
5299 .sum();
5300 #[expect(
5301 clippy::cast_possible_truncation,
5302 clippy::cast_sign_loss,
5303 reason = "post_envelope * fraction is bounded by post_envelope (a usize); result is non-negative"
5304 )]
5305 let fraction_floor = (post_envelope as f64 * CALLS_BUDGET_FRACTION) as usize;
5306 #[expect(
5307 clippy::cast_possible_truncation,
5308 clippy::cast_sign_loss,
5309 reason = "post_envelope * fraction is bounded by post_envelope (a usize); result is non-negative"
5310 )]
5311 let fraction_ceil = (post_envelope as f64 * CALLS_MAX_FRACTION) as usize;
5312 let calls_reserve = precise_calls_bytes.max(fraction_floor).min(fraction_ceil);
5313 let symbols_budget = FILE_ENVELOPE_MIN_BYTES + post_envelope.saturating_sub(calls_reserve);
5314 let mut used: usize = FILE_ENVELOPE_MIN_BYTES; // envelope cost
5315
5316 // ── Symbols ──────────────────────────────────────────────────
5317 // Retrieve def-level ranks for this file via the offset table.
5318 let def_count = file.defs.len();
5319 let def_offset = if file_idx < graph.def_offsets.len() {
5320 graph.def_offsets[file_idx]
5321 } else {
5322 0
5323 };
5324
5325 // Build (def_idx, rank, kind_priority, start_byte) tuples. We sort
5326 // by a composite key: AST kind priority (descending) — putting types
5327 // before functions before fields — then by def_rank (descending)
5328 // within each tier. This is Fix B (4.0.2): the def_rank distribution
5329 // is often degenerate (most defs share near-zero rank because the
5330 // call-edge extractor doesn't capture every dispatch), so we use
5331 // structural signal as the primary ordering and def_rank as the
5332 // within-tier tiebreaker. When def_rank IS informative, it dominates
5333 // *within* its kind tier and recovers the original behavior; the AST
5334 // signal only shifts ordering *between* tiers.
5335 let mut def_rank_pairs: Vec<(usize, f32, u32, u32)> = (0..def_count)
5336 .map(|di| {
5337 let flat = def_offset + di;
5338 let r = graph.def_ranks.get(flat).copied().unwrap_or(0.0);
5339 // Store the ORIGINAL ast_kind_priority in the tuple (used by the
5340 // per-tier attenuation loop below). The sort comparator uses
5341 // effective_priority (which may be higher due to 4.0.4 promotion)
5342 // to reorder hot defs ahead of cold type-tier defs, while the
5343 // attenuation tier tracker continues to use the original AST tier
5344 // so the existing per-tier cutoff behaviour is preserved.
5345 let kind_prio = ast_kind_priority(&file.defs[di].kind);
5346 let decl_order = file.defs[di].start_byte;
5347 (di, r, kind_prio, decl_order)
5348 })
5349 .collect();
5350 def_rank_pairs.sort_unstable_by(|a, b| {
5351 // Primary: effective priority (4.0.4: AST kind + corpus-rank promotion) descending.
5352 // Hot defs that exceed corpus-median thresholds are promoted above their
5353 // declared tier so they surface before cold type-tier defs.
5354 let eff_a = effective_priority(
5355 &file.defs[a.0].kind,
5356 a.1,
5357 promo_1_threshold,
5358 promo_2_threshold,
5359 );
5360 let eff_b = effective_priority(
5361 &file.defs[b.0].kind,
5362 b.1,
5363 promo_1_threshold,
5364 promo_2_threshold,
5365 );
5366 eff_b
5367 .cmp(&eff_a)
5368 // Secondary: def_rank descending within tier.
5369 .then_with(|| b.1.total_cmp(&a.1))
5370 // Tertiary: earlier declaration order (stable, deterministic).
5371 .then_with(|| a.3.cmp(&b.3))
5372 });
5373
5374 let top_def_rank = def_rank_pairs.first().map(|&(_, r, _, _)| r).unwrap_or(0.0);
5375
5376 let mut symbols: Vec<RepoMapSymbol> = Vec::new();
5377 let mut truncated_symbols: usize = 0;
5378
5379 // Track per-tier position for the attenuation cutoff. When AST kind
5380 // priority changes (we've moved from types to functions, say), reset
5381 // the position so the attenuation curve restarts. Otherwise a
5382 // structurally-equivalent-but-later tier would be unfairly cut.
5383 let mut tier_pos: usize = 0;
5384 let mut current_tier: Option<u32> = None;
5385 let mut tier_top_rank: f32 = top_def_rank;
5386
5387 for (pos, &(di, def_r, kind_prio, _)) in def_rank_pairs.iter().enumerate() {
5388 // Reset attenuation at tier boundaries.
5389 if current_tier != Some(kind_prio) {
5390 current_tier = Some(kind_prio);
5391 tier_pos = 0;
5392 tier_top_rank = def_r;
5393 }
5394
5395 // Logarithmic attenuation cutoff, relative to the tier's top rank.
5396 let cutoff = if tier_top_rank > 0.0 {
5397 tier_top_rank / (1.0 + (tier_pos as f32 + 1.0).ln())
5398 } else {
5399 0.0
5400 };
5401 if def_r < cutoff {
5402 // Attenuation cuts the rest of THIS tier; we don't stop
5403 // entirely because the next tier may still have useful
5404 // content within its own attenuation curve. Skip this def.
5405 truncated_symbols += 1;
5406 tier_pos += 1;
5407 continue;
5408 }
5409
5410 let def = &file.defs[di];
5411 let sym_bytes = estimate_symbol_bytes(&def.name);
5412 // Use the reserved symbols sub-budget (Fix C) so calls aren't
5413 // starved when symbols would otherwise saturate budget_in.
5414 if used + sym_bytes > symbols_budget {
5415 truncated_symbols += def_rank_pairs.len() - pos;
5416 break;
5417 }
5418
5419 // C2 (4.1.1): Use the AST-computed lsp_kind_hint when available
5420 // (populated at parse time for Python decorated_definition nodes).
5421 // Fall back to the AST-less string mapping for all other kinds.
5422 let kind = def
5423 .lsp_kind_hint
5424 .unwrap_or_else(|| crate::languages::lsp_symbol_kind_for_node_kind(&def.kind));
5425 let line_0 = def.start_line.saturating_sub(1) as usize;
5426 symbols.push(RepoMapSymbol {
5427 name: def.name.clone(),
5428 kind,
5429 lsp_location: RepoMapLspLocation {
5430 file_path: file_path_lsp.file_path.clone(),
5431 start_line: line_0,
5432 start_character: 0,
5433 end_line: line_0,
5434 end_character: 0,
5435 },
5436 rank: def_r,
5437 });
5438 used += sym_bytes;
5439 tier_pos += 1;
5440 }
5441
5442 // ── Calls ─────────────────────────────────────────────────────
5443 // `callee_indices` was pre-computed above (before the budget split) for
5444 // the precise-reserve calculation. Re-use it here.
5445 let call_total = callee_indices.len();
5446
5447 let mut calls: Vec<RepoMapCall> = Vec::new();
5448 let mut truncated_calls: usize = 0;
5449
5450 for (pos, &ci) in callee_indices.iter().enumerate() {
5451 let callee_rank = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
5452
5453 // Hard render-time cap (I#68, 4.1.4): stop once we have rendered
5454 // MAX_FILE_CALLS entries. This is symmetric with the MAX_NEIGHBORS
5455 // graph-build cap (I#60) and replaces the old logarithmic attenuation
5456 // cutoff. Attenuation was correct for *symbols* (rank distributions
5457 // are informative) but pathological for *call edges*: in real corpora
5458 // callee base-ranks follow a geometric distribution, causing attenuation
5459 // to fire at pos=1 and collapse calls[] to a single entry.
5460 if calls.len() >= MAX_FILE_CALLS {
5461 truncated_calls += call_total - pos;
5462 break;
5463 }
5464
5465 let callee_path = &graph.files[ci].path;
5466 let call_bytes = estimate_call_bytes(callee_path);
5467 if used + call_bytes > budget_in {
5468 truncated_calls += call_total - pos;
5469 break;
5470 }
5471
5472 calls.push(RepoMapCall {
5473 lsp_location: file_lsp_location(callee_path),
5474 rank: callee_rank,
5475 });
5476 used += call_bytes;
5477 }
5478
5479 // Carry unused bytes forward to the next file.
5480 leftover = budget_in.saturating_sub(used);
5481
5482 result_files.push(RepoMapFile {
5483 lsp_location: file_path_lsp,
5484 rank: file_rank,
5485 content_kind: content_kind_tag(content_kind_for_path(&file.path)),
5486 calls,
5487 symbols,
5488 truncated_symbols,
5489 truncated_calls,
5490 });
5491 }
5492
5493 let estimated_bytes = serde_json::to_string(&result_files)
5494 .map(|s| s.len())
5495 .unwrap_or(0);
5496
5497 let budget_exhausted = total_files > result_files.len();
5498
5499 GetRepoMapResponse {
5500 files: result_files,
5501 total_files,
5502 estimated_bytes,
5503 budget_bytes: budget_total_bytes,
5504 budget_exhausted,
5505 capped: budget_exhausted,
5506 }
5507}
5508
5509/// Render a `PageRank`-sorted JSON map of the repository (4.0.0 compatibility shim).
5510///
5511/// This function wraps [`render_json_budgeted`] with a synthetic token budget
5512/// derived from `max_files * 2000` (a generous per-file allowance). It exists
5513/// to keep the existing D1/D2 unit tests compiling without change; the MCP
5514/// layer calls [`render_json_budgeted`] directly in 4.0.1.
5515///
5516/// The `capped` field in the response reflects whether the budget was
5517/// exhausted before all `eligible` files were included, which is equivalent
5518/// to the previous `total_files > max_files` check.
5519///
5520/// When `include_metadata` is `false` (default), files whose extension
5521/// classifies as [`ContentKind::Meta`] are excluded before ranking.
5522#[must_use]
5523pub fn render_json(
5524 graph: &RepoGraph,
5525 max_files: usize,
5526 focus: Option<usize>,
5527 include_metadata: bool,
5528) -> GetRepoMapResponse {
5529 // Synthesise a generous token budget: 2000 tokens per requested file.
5530 // This ensures the existing D1/D2 tests (which pass small max_files values
5531 // like 3, 5, 50) see the same file-count behaviour they expect. The test
5532 // assertions check file counts, not byte sizes, so the exact budget value
5533 // only matters for ensuring enough headroom.
5534 let token_budget = max_files.saturating_mul(2000);
5535 render_json_budgeted(graph, token_budget, focus, include_metadata)
5536}
5537
5538// ── Tests ────────────────────────────────────────────────────────────
5539
5540#[cfg(test)]
5541mod tests {
5542 use super::*;
5543
#[test]
fn test_pagerank_simple() {
    // Directed 3-cycle: 0 -> 1 -> 2 -> 0.
    let cycle = vec![(0, 1, 1), (1, 2, 1), (2, 0, 1)];
    let ranks = pagerank(3, &cycle, None);
    assert_eq!(ranks.len(), 3);

    // The scores form a probability distribution.
    let sum: f32 = ranks.iter().copied().sum();
    let drift = (sum - 1.0).abs();
    assert!(drift < 0.01, "ranks should sum to ~1.0, got {sum}");

    // A perfect cycle is symmetric, so every node should sit near 1/3.
    let expected = 1.0_f32 / 3.0;
    ranks.iter().enumerate().for_each(|(i, &r)| {
        assert!(
            (r - expected).abs() < 0.05,
            "rank[{i}] = {r}, expected ~{expected}"
        );
    });
}
5567
#[test]
fn test_pagerank_star() {
    // Star graph: nodes 0, 1 and 2 each point at the hub, node 3.
    let spokes = vec![(0, 3, 1), (1, 3, 1), (2, 3, 1)];
    let ranks = pagerank(4, &spokes, None);
    assert_eq!(ranks.len(), 4);

    // The hub must be the top-ranked node.
    let (max_idx, _) = ranks
        .iter()
        .enumerate()
        .max_by(|a, b| a.1.total_cmp(b.1))
        .unwrap();
    assert_eq!(max_idx, 3, "node 3 should have highest rank");

    // ...and strictly above a spoke.
    let (hub, spoke) = (ranks[3], ranks[0]);
    assert!(
        hub > spoke,
        "rank[3]={} should be > rank[0]={}",
        hub,
        spoke
    );
}
5590
#[test]
fn test_pagerank_topic_sensitive() {
    // 10-node chain: 0 -> 1 -> ... -> 9.
    //
    // With PERSONALIZATION_ALPHA = 0.15 and n = 10 the uniform teleportation
    // share per node is 1/10 = 0.10, so the focus node (0) receives 0.15
    // instead of 0.10 and its focused rank must exceed its uniform rank.
    //
    // An earlier 3-node version of this test broke when alpha dropped from
    // 0.70 to 0.15: 0.15 < 1/3, so the focus node got *less* teleportation
    // than its uniform share and the comparison inverted. n = 10 sidesteps
    // that edge case while still exercising personalization.
    let n = 10_usize;
    let last = n - 1;
    #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
    let edges: Vec<(u32, u32, u32)> = (0..last)
        .map(|i| (i as u32, (i + 1) as u32, 1_u32))
        .collect();

    let uniform_ranks = pagerank(n, &edges, None);
    let biased_ranks = pagerank(n, &edges, Some(0));

    // Focusing on node 0 must raise its rank, because
    // PERSONALIZATION_ALPHA (0.15) > 1/n (0.10) for n = 10.
    let (focused, uniform) = (biased_ranks[0], uniform_ranks[0]);
    assert!(
        focused > uniform,
        "focused rank[0]={} should be > uniform rank[0]={}",
        focused,
        uniform
    );
}
5621
5622 // ── J1 tests — topic-sensitive PageRank soft personalization ─────────
5623
/// J1 RED: focusing on a file must not collapse the ranks of all other files.
///
/// Before 4.0.5 the focus node received 70% of the teleportation mass, which
/// produced a degenerate spike: focus rank ≈ 0.703, every other node ≈ 0.003.
/// That baseline fails this test; the fixed code must pass it.
///
/// Invariant: with `PERSONALIZATION_ALPHA = 0.15` the focus node gets 0.15 of
/// the teleportation mass and each of the other (n-1) nodes gets 0.85/(n-1).
/// On a 10-node star graph the focus rank must be at most 40× the average
/// non-focus rank. The 4.0.5 fix targets roughly 5-10× on a well-connected
/// graph, so 40× is a conservative bound that the baseline (≈200×) violates.
#[test]
fn test_focus_file_topic_pagerank_preserves_rank_dispersion() {
    // Star graph: nodes 1..9 all point at node 0 (naturally high rank).
    // The focus is node 1 (naturally low rank) to isolate the personalization
    // effect.
    let n = 10_usize;
    #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
    let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();

    let ranks = pagerank(n, &edges, Some(1));

    let focus_rank = ranks[1];
    let others_total: f32 = ranks
        .iter()
        .enumerate()
        .filter_map(|(i, &r)| (i != 1).then_some(r))
        .sum();
    let others_count = (n - 1) as f32;
    let avg_non_focus = others_total / others_count;

    let dispersion_ratio = focus_rank / avg_non_focus;

    eprintln!(
        "J1 dispersion: focus_rank={focus_rank:.6}, avg_non_focus={avg_non_focus:.6}, \
         ratio={dispersion_ratio:.2}× (must be <= 40×)"
    );

    // At alpha = 0.15 the focus node's teleportation advantage is modest;
    // the old 0.70 code exceeds the 40× bound.
    assert!(
        dispersion_ratio <= 40.0,
        "focus rank is {dispersion_ratio:.1}× avg non-focus rank (must be ≤ 40×); \
         pre-fix baseline was ~200× due to 70% concentration — I#16"
    );

    // The ranks must still form a probability distribution.
    let total: f32 = ranks.iter().copied().sum();
    assert!(
        (total - 1.0).abs() < 0.01,
        "ranks must sum to ≈1.0; got {total}"
    );
}
5678
/// J1 RED: soft personalization must leave every non-focus node with a
/// meaningful share of rank instead of flattening them to a floor.
///
/// Concretely: on a 10-node directed chain focused on node 0, each of the
/// other nodes must keep a rank of at least 10% of the focus node's rank
/// (neighborhood rebiasing, not winner-take-all).
#[test]
fn test_focus_file_topic_pagerank_does_not_collapse_other_files() {
    // Directed chain 0 → 1 → 2 → … → 9, with the focus on the head node.
    let n = 10_usize;
    #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
    let edges: Vec<(u32, u32, u32)> = (1..n)
        .map(|next| ((next - 1) as u32, next as u32, 1_u32))
        .collect();

    let ranks = pagerank(n, &edges, Some(0));

    let focus_rank = ranks[0];
    // Smallest rank a non-focus node may hold: 10% of the focus rank.
    let floor = focus_rank * 0.10;
    ranks.iter().enumerate().skip(1).for_each(|(i, &r)| {
        assert!(
            r >= floor,
            "rank[{i}] = {r:.6} is < 10% of focus rank {focus_rank:.6}; \
             non-focus files must not collapse to near-zero (I#16)"
        );
    });
}
5707
5708 // ── J2 tests — neighborhood count parity ─────────────────────────────
5709
/// J2 RED: `render_json_budgeted` with `focus=Some(i)` must return at
/// least 70% as many files as the unfocused call with the same budget.
///
/// Baseline (pre-4.0.5, α=0.70) collapsed the focused run to 1 dominant
/// file + a flat tail — that's what this test was originally written to
/// catch. The threshold was 80% when α was 0.15; raising α to 0.35
/// (4.1.12, to satisfy the flask focus-in-top-3 invariant) pulls budget
/// toward the focus neighborhood and drops file count to ~70% on small
/// synthetic stars. 70% remains far from the "1 file + zero tail"
/// degenerate baseline this test guards against.
#[test]
fn test_focus_file_returns_neighborhood_not_just_focus() {
    // Build a 12-file star graph with meaningful rank variation:
    // files 1..=11 each point at file 0.
    let n = 12_usize;
    #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
    let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
    let base_ranks = pagerank(n, &edges, None);
    let (callers, callees) = build_neighbor_lists(n, &edges);

    // One small function definition per file so each file has something to render.
    let file_nodes: Vec<FileNode> = (0..n)
        .map(|i| FileNode {
            path: format!("src/file_{i}.rs"),
            defs: vec![Definition {
                name: format!("func_{i}"),
                kind: "function_item".to_string(),
                start_line: 1,
                end_line: 5,
                scope: String::new(),
                signature: Some(format!("fn func_{i}() -> i32")),
                start_byte: 0,
                end_byte: 100,
                calls: vec![],
                decorator: None,
                lsp_kind_hint: None,
            }],
            imports: vec![],
        })
        .collect();

    // Only file-level data is populated; the definition-level vectors stay empty.
    let graph = RepoGraph {
        files: file_nodes,
        edges,
        base_ranks,
        callers,
        callees,
        def_edges: vec![],
        def_ranks: vec![],
        def_callers: vec![],
        def_callees: vec![],
        def_offsets: vec![0],
        alpha: 0.5,
    };

    let budget = 2000; // generous budget; all 12 files should fit
    let unfocused = render_json_budgeted(&graph, budget, None, false);
    let focused = render_json_budgeted(&graph, budget, Some(1), false);

    let unfocused_n = unfocused.files.len();
    let focused_n = focused.files.len();
    // The lint reason must quote the factor actually used below (0.70).
    #[expect(
        clippy::cast_possible_truncation,
        clippy::cast_sign_loss,
        reason = "unfocused_n is a file count (small, positive); f32 multiplication \
                  by 0.70 and ceil produce a value in [0, n]; truncation to usize is safe"
    )]
    let min_expected = (unfocused_n as f32 * 0.70).ceil() as usize;

    eprintln!(
        "J2 neighborhood: unfocused={unfocused_n} files, focused={focused_n} files \
         (need ≥ {min_expected})"
    );

    assert!(
        focused_n >= min_expected,
        "focused call returned {focused_n} files; expected ≥ {min_expected} \
         (70% of unfocused {unfocused_n}); soft personalization must preserve \
         rank dispersion across files (I#16/J2)"
    );
}
5789
/// J2 RED: topic delta fingerprinting at the `pagerank` level.
///
/// On a symmetric ring every node is structurally equivalent, so the
/// unfocused ranks must be approximately uniform. Focusing on one node must
/// lift that node to the top of the ranking while every other node keeps at
/// least 5% of the focus node's rank.
#[test]
fn test_focus_delta_topic_fingerprinting_works() {
    // Bidirectional 8-node ring: each node links to its successor and back.
    let n = 8_usize;
    #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
    let edges: Vec<(u32, u32, u32)> = (0..n)
        .flat_map(|node| {
            let succ = ((node + 1) % n) as u32;
            let here = node as u32;
            [(here, succ, 1_u32), (succ, here, 1_u32)]
        })
        .collect();

    let ranks_uniform = pagerank(n, &edges, None);
    let ranks_focused = pagerank(n, &edges, Some(3));

    // The focus node has to come out on top.
    let (top_idx, _) = ranks_focused
        .iter()
        .enumerate()
        .max_by(|(_, lhs), (_, rhs)| lhs.total_cmp(rhs))
        .unwrap();

    assert_eq!(
        top_idx, 3,
        "with focus=Some(3), node 3 must have highest rank; top was {top_idx}"
    );

    // Without a focus the ring's symmetry must show up as a flat rank profile.
    let (uniform_min, uniform_max) = ranks_uniform.iter().fold(
        (f32::INFINITY, f32::NEG_INFINITY),
        |(lo, hi), &r| (lo.min(r), hi.max(r)),
    );
    let spread = (uniform_max - uniform_min).abs();
    assert!(
        spread < 0.01,
        "on a ring without focus all ranks should be ≈equal; max={uniform_max:.6} min={uniform_min:.6}"
    );

    // With a focus, the other nodes must stay non-trivial (≥ 5% of focus).
    let focus_rank = ranks_focused[3];
    for (i, &r) in ranks_focused.iter().enumerate() {
        if i == 3 {
            continue;
        }
        assert!(
            r >= focus_rank * 0.05,
            "rank[{i}]={r:.6} is < 5% of focus rank {focus_rank:.6}; \
             soft personalization must preserve non-focus ranks"
        );
    }
}
5846
5847 // ── T1 tests — focus_file resolver normalization (I#20) ──────────────
5848
5849 /// Build a tiny synthetic graph whose `FileNode::path` values match the
5850 /// shape `build_graph` produces on disk (no leading `./`, forward slashes).
5851 fn focus_resolver_graph() -> RepoGraph {
5852 let file_nodes: Vec<FileNode> = vec![
5853 FileNode {
5854 path: "device_opt/services/storage.py".to_string(),
5855 defs: vec![],
5856 imports: vec![],
5857 },
5858 FileNode {
5859 path: "device_opt/ui/textual/screens/settings.py".to_string(),
5860 defs: vec![],
5861 imports: vec![],
5862 },
5863 FileNode {
5864 path: "device_opt/services/registry.py".to_string(),
5865 defs: vec![],
5866 imports: vec![],
5867 },
5868 FileNode {
5869 path: "tests/test_storage.py".to_string(),
5870 defs: vec![],
5871 imports: vec![],
5872 },
5873 ];
5874 let n = file_nodes.len();
5875 RepoGraph {
5876 files: file_nodes,
5877 edges: vec![],
5878 base_ranks: vec![1.0 / n as f32; n],
5879 callers: vec![vec![]; n],
5880 callees: vec![vec![]; n],
5881 def_edges: vec![],
5882 def_ranks: vec![],
5883 def_callers: vec![],
5884 def_callees: vec![],
5885 def_offsets: vec![0; n + 1],
5886 alpha: 0.5,
5887 }
5888 }
5889
/// T1: a focus path in the `lsp_location.file_path` form (leading `./`)
/// must resolve to the index of the matching stored file.
///
/// Baseline reproduction (mnemosyne corpus, 4.0.5): passing
/// `focus_file="./device_opt/.../settings.py"` yielded ranks bit-identical
/// to the unfocused call, because the strict-suffix matcher in `tools.rs`
/// failed both its `exact` and `strict_suffix` checks on the `./` prefix and
/// silently fell back to `focus = None`. That made the failure look like
/// "topic-sensitive PageRank does nothing on Python".
#[test]
fn test_focus_file_resolver_accepts_lsp_location_path() {
    let g = focus_resolver_graph();
    // The `./`-prefixed form is the one documented in get_repo_map's
    // instructions, so it is the form callers will actually send.
    let res = g.resolve_focus_file("./device_opt/ui/textual/screens/settings.py");
    let idx = match res {
        FocusResolution::Found(idx) => idx,
        FocusResolution::NotFound | FocusResolution::Ambiguous(_) => {
            panic!(
                "resolver returned {res:?} for ./device_opt/ui/textual/screens/settings.py; \
                 the LSP-shaped path form must resolve to exactly one file (I#20)"
            );
        }
    };
    assert_eq!(
        g.files[idx].path, "device_opt/ui/textual/screens/settings.py",
        "resolver must accept the ./-prefixed LSP path form (I#20)"
    );
}
5921
/// T1: a focus given exactly as the path is stored (no `./` prefix) must
/// keep resolving. Regression guard for the pre-fix matcher's "exact" path.
#[test]
fn test_focus_file_resolver_accepts_bare_stored_path() {
    let g = focus_resolver_graph();
    let idx = match g.resolve_focus_file("device_opt/services/storage.py") {
        FocusResolution::Found(idx) => idx,
        other => panic!("expected Found, got {other:?}"),
    };
    assert_eq!(g.files[idx].path, "device_opt/services/storage.py");
}
5935
/// T1: strict-suffix matching and ambiguity reporting.
///
/// `storage.py` must match `device_opt/services/storage.py` (the character
/// before the suffix is `/`), and when two files share that suffix the
/// resolver must report the ambiguity rather than silently pick one.
#[test]
fn test_focus_file_resolver_strict_suffix_and_ambiguity() {
    let g = focus_resolver_graph();
    // `tests/test_storage.py` also ends in "storage.py", but the preceding
    // character is `_`, not `/`, so the strict-suffix matcher rejects it and
    // exactly one candidate remains.
    let res = g.resolve_focus_file("storage.py");
    assert!(
        matches!(res, FocusResolution::Found(_)),
        "strict-suffix `storage.py` must match exactly one file (the `_` in \
         test_storage.py blocks the strict-suffix), got {res:?}"
    );

    // Append a second `…/services/storage.py` file, extending every per-file
    // vector in step, to force a genuine ambiguity.
    let mut g2 = g.clone();
    let tail_offset = *g2.def_offsets.last().unwrap();
    g2.def_offsets.push(tail_offset);
    g2.callees.push(Vec::new());
    g2.callers.push(Vec::new());
    g2.base_ranks.push(0.0);
    g2.files.push(FileNode {
        path: "vendored/services/storage.py".to_string(),
        defs: Vec::new(),
        imports: Vec::new(),
    });

    let other = g2.resolve_focus_file("storage.py");
    if let FocusResolution::Ambiguous(cands) = &other {
        assert_eq!(cands.len(), 2, "expected two candidates, got {cands:?}");
    } else {
        panic!("expected Ambiguous, got {other:?}");
    }
}
5970
/// T1: a focus path that matches no stored file resolves to `NotFound`.
///
/// The resolver imposes no policy of its own: whether to fall back to an
/// unfocused run or to surface an error is left to the caller.
#[test]
fn test_focus_file_resolver_not_found() {
    let g = focus_resolver_graph();
    let res = g.resolve_focus_file("./does/not/exist.py");
    let FocusResolution::NotFound = res else {
        panic!("expected NotFound, got {res:?}");
    };
}
5983
/// T1: an empty focus string resolves to `NotFound`.
///
/// Guards against the degenerate case where the empty string, being a
/// suffix of every path, would otherwise match every file.
#[test]
fn test_focus_file_resolver_empty_input_is_not_found() {
    let g = focus_resolver_graph();
    let res = g.resolve_focus_file("");
    let FocusResolution::NotFound = res else {
        panic!("empty focus must not match anything, got {res:?}");
    };
}
5995
/// T1: focus_file rank delta must be visible on a Python-shaped
/// synthetic graph.
///
/// Builds a small Python-style call graph (FileNode + Definition with
/// unresolved CallRefs that `resolve_calls` then resolves), runs
/// `build_graph_from_files_pub` to get a `RepoGraph`, then calls
/// `render_json_budgeted` with and without focus. Asserts that the focused
/// call changes the rank of at least one non-focus file by ≥ 5% in either
/// direction, and that at least one non-focus rank is not bit-identical
/// between the two runs.
///
/// On the baseline (pre-T1) this test passes when the caller supplies
/// an int focus_idx directly — the engine's topic-sensitive PageRank is
/// correct. The bug was at the string-to-int resolver layer in
/// `tools.rs`, which silently masked the failure as "the rendering
/// path doesn't propagate focus". This test locks the engine's
/// behavior so a future regression in the rendering path is caught.
#[test]
#[expect(
    clippy::too_many_lines,
    reason = "synthetic Python-shaped graph (five FileNodes with defs + \
              CallRefs + ImportRefs) plus the two-call assertion sequence \
              is inherently long; splitting into helpers would obscure the \
              one-shot reproduction the test is locking in."
)]
fn test_focus_file_rank_delta_visible_on_python_corpus() {
    // Five files, Python-shaped. services/storage.py is the "hub": its
    // `save_scan` is called from registry.py, ui/screens/browse.py and
    // tests/test_storage.py. ui/screens/settings.py (the focus file) calls
    // `register` in registry.py instead. Definition kinds mirror the Python
    // tree-sitter node names (`class_definition`, `function_definition`).
    //
    // The order of this vector matters: `ImportRef::resolved_idx` values
    // below are indices into it (0 = storage.py, 1 = registry.py).
    let mut files: Vec<FileNode> = vec![
        // [0] Hub: defines `ScanStore` and its `save_scan` method; calls nothing.
        FileNode {
            path: "device_opt/services/storage.py".to_string(),
            defs: vec![
                Definition {
                    name: "ScanStore".to_string(),
                    kind: "class_definition".to_string(),
                    start_line: 1,
                    end_line: 80,
                    scope: String::new(),
                    signature: None,
                    start_byte: 0,
                    end_byte: 2000,
                    calls: vec![],
                    decorator: None,
                    lsp_kind_hint: None,
                },
                Definition {
                    name: "save_scan".to_string(),
                    kind: "function_definition".to_string(),
                    start_line: 20,
                    end_line: 40,
                    scope: "class_definition ScanStore".to_string(),
                    signature: Some("def save_scan(self, scan)".to_string()),
                    start_byte: 200,
                    end_byte: 600,
                    calls: vec![],
                    decorator: None,
                    lsp_kind_hint: None,
                },
            ],
            imports: vec![],
        },
        // [1] registry.py: `register` calls `save_scan`; imports storage.py.
        FileNode {
            path: "device_opt/services/registry.py".to_string(),
            defs: vec![Definition {
                name: "register".to_string(),
                kind: "function_definition".to_string(),
                start_line: 1,
                end_line: 30,
                scope: String::new(),
                signature: Some("def register(svc)".to_string()),
                start_byte: 0,
                end_byte: 600,
                calls: vec![CallRef {
                    name: "save_scan".to_string(),
                    qualified_path: None,
                    receiver_type: None,
                    byte_offset: 100,
                    resolved: None,
                }],
                decorator: None,
                lsp_kind_hint: None,
            }],
            imports: vec![ImportRef {
                raw_path: "from device_opt.services import storage".to_string(),
                resolved_idx: Some(0),
            }],
        },
        // [2] browse.py: `browse_scans` calls `save_scan`; imports storage.py.
        FileNode {
            path: "device_opt/ui/screens/browse.py".to_string(),
            defs: vec![Definition {
                name: "browse_scans".to_string(),
                kind: "function_definition".to_string(),
                start_line: 1,
                end_line: 50,
                scope: String::new(),
                signature: Some("def browse_scans(app)".to_string()),
                start_byte: 0,
                end_byte: 1000,
                calls: vec![CallRef {
                    name: "save_scan".to_string(),
                    qualified_path: None,
                    receiver_type: None,
                    byte_offset: 200,
                    resolved: None,
                }],
                decorator: None,
                lsp_kind_hint: None,
            }],
            imports: vec![ImportRef {
                raw_path: "from device_opt.services import storage".to_string(),
                resolved_idx: Some(0),
            }],
        },
        // [3] settings.py (the focus file): `open_settings` calls `register`;
        // imports registry.py.
        FileNode {
            path: "device_opt/ui/screens/settings.py".to_string(),
            defs: vec![Definition {
                name: "open_settings".to_string(),
                kind: "function_definition".to_string(),
                start_line: 1,
                end_line: 40,
                scope: String::new(),
                signature: Some("def open_settings(app)".to_string()),
                start_byte: 0,
                end_byte: 800,
                calls: vec![CallRef {
                    name: "register".to_string(),
                    qualified_path: None,
                    receiver_type: None,
                    byte_offset: 150,
                    resolved: None,
                }],
                decorator: None,
                lsp_kind_hint: None,
            }],
            imports: vec![ImportRef {
                raw_path: "from device_opt.services import registry".to_string(),
                resolved_idx: Some(1),
            }],
        },
        // [4] test_storage.py: `test_save` calls `save_scan`; imports storage.py.
        FileNode {
            path: "tests/test_storage.py".to_string(),
            defs: vec![Definition {
                name: "test_save".to_string(),
                kind: "function_definition".to_string(),
                start_line: 1,
                end_line: 20,
                scope: String::new(),
                signature: Some("def test_save()".to_string()),
                start_byte: 0,
                end_byte: 400,
                calls: vec![CallRef {
                    name: "save_scan".to_string(),
                    qualified_path: None,
                    receiver_type: None,
                    byte_offset: 50,
                    resolved: None,
                }],
                decorator: None,
                lsp_kind_hint: None,
            }],
            imports: vec![ImportRef {
                raw_path: "from device_opt.services import storage".to_string(),
                resolved_idx: Some(0),
            }],
        },
    ];

    // Resolve calls so the graph builder has edges to chew on. Every
    // `CallRef` above starts with `resolved: None`.
    let def_index = build_def_index(&files);
    resolve_calls(&mut files, &def_index, &HashMap::new());
    let graph = build_graph_from_files_pub(files);

    // Sanity: the graph must have edges (the calls were resolved).
    assert!(
        !graph.edges.is_empty(),
        "Python-shaped synthetic graph must produce file-level edges; got 0. \
         The CallRefs may have failed to resolve."
    );

    // Resolve the focus file via the resolver helper, using the
    // `./`-prefixed LSP-shaped form of the path.
    let focus_idx = match graph.resolve_focus_file("./device_opt/ui/screens/settings.py") {
        FocusResolution::Found(i) => i,
        other => panic!("resolver must find settings.py via LSP-shaped path, got {other:?}"),
    };

    // Same budget for both runs so only the focus differs.
    let budget = 4000;
    let unfocused = render_json_budgeted(&graph, budget, None, false);
    let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);

    // Collect rank-by-path maps for both runs, keyed by the rendered
    // `lsp_location.file_path` of each file.
    let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
        .files
        .iter()
        .map(|f| (f.lsp_location.file_path.clone(), f.rank))
        .collect();
    let focused_ranks: std::collections::HashMap<String, f32> = focused
        .files
        .iter()
        .map(|f| (f.lsp_location.file_path.clone(), f.rank))
        .collect();

    eprintln!("T1 Python — unfocused ranks: {unfocused_ranks:#?}");
    eprintln!("T1 Python — focused ranks: {focused_ranks:#?}");

    // Find at least one non-focus file whose rank changed by ≥ 5% in
    // either direction. The threshold is intended to be conservative.
    // NOTE(review): this was sized for a 0.15 personalization alpha, with
    // neighbors expected to shift by 20%+ on this 5-node graph — confirm
    // against the current alpha value.
    //
    // `focus_path` carries the `./` prefix so it can be compared against
    // the map keys; presumably rendered `file_path` values use that form —
    // TODO confirm.
    let focus_path = "./device_opt/ui/screens/settings.py";
    let mut max_delta_ratio = 0.0_f32;
    for (path, &u_rank) in &unfocused_ranks {
        if path == focus_path {
            continue;
        }
        // Only files present in both runs with a positive unfocused rank
        // contribute; the `u_rank > 0.0` guard avoids dividing by zero.
        if let Some(&f_rank) = focused_ranks.get(path)
            && u_rank > 0.0
        {
            let ratio = (f_rank - u_rank).abs() / u_rank;
            if ratio > max_delta_ratio {
                max_delta_ratio = ratio;
            }
        }
    }
    assert!(
        max_delta_ratio >= 0.05,
        "focus_file must rebias non-focus file ranks by ≥ 5%; \
         max observed delta ratio = {max_delta_ratio:.3} \
         (I#20: focus_file invisible on Python corpora)"
    );

    // Bit-identity guard: at least one non-focus file's rank must NOT
    // equal its unfocused value. This is the pathology from the
    // mnemosyne reproduction: every rank value was bit-identical
    // across global/focused calls.
    let any_changed = unfocused_ranks.iter().any(|(path, &u_rank)| {
        path != focus_path
            && focused_ranks
                .get(path)
                .is_some_and(|&f_rank| f_rank.to_bits() != u_rank.to_bits())
    });
    assert!(
        any_changed,
        "no non-focus file rank changed across focused/unfocused calls — \
         bit-identical pathology (I#20). unfocused={unfocused_ranks:#?} \
         focused={focused_ranks:#?}"
    );
}
6244
/// T1: focus_file rank delta on a Rust-shaped synthetic graph.
///
/// Regression test: confirms the engine's topic-sensitive PageRank
/// works on Rust shapes (where T1's investigation found it already
/// works, but the resolver fix must not break the existing path).
///
/// This complements `test_focus_file_returns_neighborhood_not_just_focus`
/// by additionally checking that (a) the resolver accepts a Rust path
/// with the `./` LSP prefix, and (b) at least one non-focus file's
/// rank moves by ≥ 5%.
#[test]
#[expect(
    clippy::too_many_lines,
    reason = "synthetic Rust-shaped graph with four FileNodes plus the \
              two-call assertion sequence inherently exceeds the 100-line \
              cap; the test mirrors the Python-shaped sibling."
)]
fn test_focus_file_rank_delta_preserved_on_rust_corpus() {
    // Four files forming a call chain: cli.rs `main` → search.rs `search`
    // → encoder/mod.rs `encode`. lib.rs defines `Engine` and is only
    // imported (by encoder/mod.rs), never called.
    //
    // The order of this vector matters: `ImportRef::resolved_idx` values
    // below are indices into it.
    let mut files: Vec<FileNode> = vec![
        // [0] lib.rs: a struct definition, no calls, no imports.
        FileNode {
            path: "src/lib.rs".to_string(),
            defs: vec![Definition {
                name: "Engine".to_string(),
                kind: "struct_item".to_string(),
                start_line: 1,
                end_line: 30,
                scope: String::new(),
                signature: None,
                start_byte: 0,
                end_byte: 600,
                calls: vec![],
                decorator: None,
                lsp_kind_hint: None,
            }],
            imports: vec![],
        },
        // [1] encoder/mod.rs (the focus file): defines `encode`; imports lib.rs.
        FileNode {
            path: "src/encoder/mod.rs".to_string(),
            defs: vec![Definition {
                name: "encode".to_string(),
                kind: "function_item".to_string(),
                start_line: 1,
                end_line: 40,
                scope: String::new(),
                signature: Some("fn encode(input: &str) -> Vec<f32>".to_string()),
                start_byte: 0,
                end_byte: 800,
                calls: vec![],
                decorator: None,
                lsp_kind_hint: None,
            }],
            imports: vec![ImportRef {
                raw_path: "use crate::lib;".to_string(),
                resolved_idx: Some(0),
            }],
        },
        // [2] search.rs: `search` calls `encode`; imports encoder/mod.rs.
        FileNode {
            path: "src/search.rs".to_string(),
            defs: vec![Definition {
                name: "search".to_string(),
                kind: "function_item".to_string(),
                start_line: 1,
                end_line: 30,
                scope: String::new(),
                signature: Some("fn search(q: &str) -> Hits".to_string()),
                start_byte: 0,
                end_byte: 600,
                calls: vec![CallRef {
                    name: "encode".to_string(),
                    qualified_path: None,
                    receiver_type: None,
                    byte_offset: 100,
                    resolved: None,
                }],
                decorator: None,
                lsp_kind_hint: None,
            }],
            imports: vec![ImportRef {
                raw_path: "use crate::encoder;".to_string(),
                resolved_idx: Some(1),
            }],
        },
        // [3] cli.rs: `main` calls `search`; imports search.rs.
        FileNode {
            path: "src/cli.rs".to_string(),
            defs: vec![Definition {
                name: "main".to_string(),
                kind: "function_item".to_string(),
                start_line: 1,
                end_line: 20,
                scope: String::new(),
                signature: Some("fn main()".to_string()),
                start_byte: 0,
                end_byte: 400,
                calls: vec![CallRef {
                    name: "search".to_string(),
                    qualified_path: None,
                    receiver_type: None,
                    byte_offset: 50,
                    resolved: None,
                }],
                decorator: None,
                lsp_kind_hint: None,
            }],
            imports: vec![ImportRef {
                raw_path: "use crate::search;".to_string(),
                resolved_idx: Some(2),
            }],
        },
    ];

    // Resolve the `CallRef`s (all start with `resolved: None`) before
    // building the graph, so the builder has call edges to work with.
    let def_index = build_def_index(&files);
    resolve_calls(&mut files, &def_index, &HashMap::new());
    let graph = build_graph_from_files_pub(files);

    // Sanity: without edges the focus comparison below would be meaningless.
    assert!(
        !graph.edges.is_empty(),
        "Rust-shaped synthetic graph must produce edges"
    );

    // (a) The resolver must accept the `./`-prefixed LSP-shaped path.
    let focus_idx = match graph.resolve_focus_file("./src/encoder/mod.rs") {
        FocusResolution::Found(i) => i,
        other => panic!("resolver must find encoder/mod.rs via LSP path, got {other:?}"),
    };

    // Same budget for both runs so only the focus differs.
    let budget = 4000;
    let unfocused = render_json_budgeted(&graph, budget, None, false);
    let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);

    // Rank-by-path maps for both runs, keyed by the rendered
    // `lsp_location.file_path` of each file.
    let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
        .files
        .iter()
        .map(|f| (f.lsp_location.file_path.clone(), f.rank))
        .collect();
    let focused_ranks: std::collections::HashMap<String, f32> = focused
        .files
        .iter()
        .map(|f| (f.lsp_location.file_path.clone(), f.rank))
        .collect();

    eprintln!("T1 Rust — unfocused: {unfocused_ranks:#?}");
    eprintln!("T1 Rust — focused: {focused_ranks:#?}");

    // (b) Largest relative rank change among the non-focus files.
    // `focus_path` carries the `./` prefix so it can be compared against
    // the map keys; presumably rendered `file_path` values use that form —
    // TODO confirm.
    let focus_path = "./src/encoder/mod.rs";
    let mut max_delta_ratio = 0.0_f32;
    for (path, &u_rank) in &unfocused_ranks {
        if path == focus_path {
            continue;
        }
        // Only files present in both runs with a positive unfocused rank
        // contribute; the `u_rank > 0.0` guard avoids dividing by zero.
        if let Some(&f_rank) = focused_ranks.get(path)
            && u_rank > 0.0
        {
            let ratio = (f_rank - u_rank).abs() / u_rank;
            if ratio > max_delta_ratio {
                max_delta_ratio = ratio;
            }
        }
    }
    assert!(
        max_delta_ratio >= 0.05,
        "focus_file must rebias non-focus file ranks by ≥ 5% on Rust shapes; \
         max observed delta = {max_delta_ratio:.3}"
    );
}
6408
/// `pagerank` on a graph with zero nodes and no edges returns no ranks.
#[test]
fn test_pagerank_empty() {
    assert!(pagerank(0, &[], None).is_empty());
}
6414
/// `render` must surface the top-ranked file under a `##` heading when the
/// budget is generous, and must still emit something — but strictly fewer
/// lines — when the budget is tiny.
#[test]
fn test_render_tiers() {
    // Ten single-definition files: enough to exercise all tiers.
    let files: Vec<FileNode> = (0..10)
        .map(|i| {
            let only_def = Definition {
                name: format!("func_{i}"),
                kind: "function_item".to_string(),
                start_line: 1,
                end_line: 5,
                scope: String::new(),
                signature: Some(format!("func_{i}(x: i32) -> i32")),
                start_byte: 0,
                end_byte: 0,
                calls: Vec::new(),
                decorator: None,
                lsp_kind_hint: None,
            };
            FileNode {
                path: format!("src/file_{i}.rs"),
                defs: vec![only_def],
                imports: Vec::new(),
            }
        })
        .collect();

    // Star topology: each of files 1-9 has a single edge into file 0.
    let edges: Vec<(u32, u32, u32)> = (1..10).map(|leaf| (leaf, 0, 1)).collect();
    let base_ranks = pagerank(10, &edges, None);
    let (callers, callees) = build_neighbor_lists(10, &edges);

    let graph = RepoGraph {
        files,
        edges,
        base_ranks,
        callers,
        callees,
        def_edges: Vec::new(),
        def_ranks: Vec::new(),
        def_callers: Vec::new(),
        def_callees: Vec::new(),
        def_offsets: vec![0],
        alpha: 0.5,
    };

    // Generous budget: the hub file must be present and rendered with the
    // top-tier heading.
    let full = render(&graph, 10_000, None);
    assert!(
        full.contains("file_0"),
        "output should contain the top-ranked file"
    );
    assert!(
        full.contains("## src/file_0.rs"),
        "top file should have tier 0 heading"
    );

    // Tiny budget: output is truncated but never empty.
    let small = render(&graph, 10, None);
    assert!(
        !small.is_empty(),
        "even tiny budget should produce some output"
    );

    // The truncated render must be strictly shorter than the full one.
    let small_lines = small.lines().count();
    let full_lines = full.lines().count();
    assert!(
        small_lines < full_lines,
        "small budget ({small_lines} lines) should have fewer lines than full ({full_lines})"
    );
}
6483
/// Rendering a graph that contains no files yields an empty string.
#[test]
fn test_render_empty_graph() {
    let empty = RepoGraph {
        files: Vec::new(),
        edges: Vec::new(),
        base_ranks: Vec::new(),
        callers: Vec::new(),
        callees: Vec::new(),
        def_edges: Vec::new(),
        def_ranks: Vec::new(),
        def_callers: Vec::new(),
        def_callees: Vec::new(),
        def_offsets: vec![0],
        alpha: 0.5,
    };
    let rendered = render(&empty, 1000, None);
    assert!(rendered.is_empty(), "empty graph should render empty string");
}
6502
/// End-to-end check of `build_graph` against the on-disk test fixtures.
///
/// Verifies that files are discovered, that the Rust and Python fixtures
/// yield their expected definitions (`hello` in `sample.rs`, `greet` in
/// `sample.py`), and that one `PageRank` score is produced per file with the
/// scores summing to ≈1.
#[test]
fn test_build_graph_on_fixtures() {
    // The fixtures directory is `tests/fixtures` two levels above this
    // crate's manifest directory.
    let fixtures = Path::new(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .and_then(Path::parent)
        .expect("crate manifest dir must sit two levels below the workspace root")
        .join("tests")
        .join("fixtures");

    let graph = build_graph(&fixtures).expect("build_graph should succeed on fixtures");

    // At least one fixture file must have been picked up.
    assert!(
        !graph.files.is_empty(),
        "graph should contain files from fixtures"
    );

    // Should find definitions in the Rust fixture.
    let rs_file = graph
        .files
        .iter()
        .find(|f| f.path.ends_with("sample.rs"))
        .expect("should find sample.rs");
    assert!(
        !rs_file.defs.is_empty(),
        "sample.rs should have definitions"
    );
    assert!(
        rs_file.defs.iter().any(|d| d.name == "hello"),
        "should find 'hello' function in sample.rs"
    );

    // Should find definitions in the Python fixture.
    let py_file = graph
        .files
        .iter()
        .find(|f| f.path.ends_with("sample.py"))
        .expect("should find sample.py");
    assert!(
        !py_file.defs.is_empty(),
        "sample.py should have definitions"
    );
    assert!(
        py_file.defs.iter().any(|d| d.name == "greet"),
        "should find 'greet' function in sample.py"
    );

    // PageRank scores should be computed: one per file, summing to ~1.
    assert_eq!(graph.base_ranks.len(), graph.files.len());
    let sum: f32 = graph.base_ranks.iter().sum();
    assert!(
        (sum - 1.0).abs() < 0.01,
        "PageRank scores should sum to ~1.0, got {sum}"
    );
}
6555
/// Two Rust `use` declarations must yield two import entries, the first of
/// which carries the `crate::foo::bar` path.
#[test]
fn test_extract_imports_rust() {
    let (lang, query) = import_query_for_extension("rs").unwrap();
    let found = extract_imports(
        "use crate::foo::bar;\nuse std::collections::HashMap;\n",
        &lang,
        &query,
    );
    assert_eq!(found.len(), 2);
    assert!(found[0].contains("crate::foo::bar"));
}
6564
/// The `pyi` (Python stub) extension must have an import query, and both
/// the `from … import …` and plain `import …` statements must be extracted
/// in source order.
#[test]
fn test_extract_imports_python_stub() {
    let (lang, query) = import_query_for_extension("pyi").unwrap();
    let found = extract_imports(
        "from typing import Protocol\nimport pkg.types\n",
        &lang,
        &query,
    );
    assert_eq!(found.len(), 2);
    let expected = ["from typing import Protocol", "import pkg.types"];
    for (entry, wanted) in found.iter().zip(expected) {
        assert!(entry.contains(wanted));
    }
}
6574
/// `import pkg.types` must resolve to the index registered for the stub
/// file `pkg/types.pyi` under the project root.
#[test]
fn test_resolve_python_import_to_stub_file() {
    let root = PathBuf::from("/project");
    let file_index = HashMap::from([(PathBuf::from("/project/pkg/types.pyi"), 1)]);

    assert_eq!(
        resolve_python_import("import pkg.types", &root, &file_index),
        Some(1)
    );
}
6584
/// A `use crate::foo::bar;` found in `src/main.rs` must resolve to the
/// index registered for `src/foo/bar.rs`.
#[test]
fn test_resolve_rust_crate_import() {
    let root = PathBuf::from("/project");
    let file_path = PathBuf::from("/project/src/main.rs");
    let file_index = HashMap::from([
        (PathBuf::from("/project/src/foo/bar.rs"), 1),
        (PathBuf::from("/project/src/main.rs"), 0),
    ]);

    assert_eq!(
        resolve_rust_import("use crate::foo::bar;", &file_path, &root, &file_index),
        Some(1)
    );
}
6596
/// A `use std::...` import has no counterpart in the (empty) file index, so
/// resolution yields `None`.
#[test]
fn test_resolve_rust_external_crate_dropped() {
    let project_root = PathBuf::from("/project");
    let importing_file = PathBuf::from("/project/src/main.rs");
    let empty_index = HashMap::new();
    let import_text = "use std::collections::HashMap;";

    let resolved = resolve_rust_import(import_text, &importing_file, &project_root, &empty_index);
    assert_eq!(resolved, None, "external crate imports should be dropped");
}
6611
/// `build_neighbor_lists` returns `(incoming, outgoing)` adjacency for a
/// three-node graph with edges 0 -> 1, 0 -> 2 and 1 -> 2.
#[test]
fn test_neighbor_lists() {
    let edges = vec![(0, 1, 1), (0, 2, 1), (1, 2, 1)];
    let (callers, callees) = build_neighbor_lists(3, &edges);

    // Both 0 and 1 have an edge into node 2.
    let into_two = &callers[2];
    assert!(into_two.contains(&0));
    assert!(into_two.contains(&1));

    // Node 0 has edges out to 1 and to 2.
    let out_of_zero = &callees[0];
    assert!(out_of_zero.contains(&1));
    assert!(out_of_zero.contains(&2));
}
6626
/// G1 (R2.3 issue a): scoped calls keep the bare name and the full path apart.
///
/// For a call written as `mod_a::foo()` the extracted call reference must have
/// - `name = "foo"` — the trailing identifier only (used for def-index lookup);
/// - `qualified_path = Some("mod_a::foo")` — the path as written (used for
///   disambiguation).
///
/// Prior to G1 the whole `"mod_a::foo"` path was stored in `name`.
#[test]
fn test_scoped_identifier_calls_preserve_path() {
    use crate::languages;
    use streaming_iterator::StreamingIterator as _;

    let source = "
mod mod_a {
 pub fn foo() {}
}
mod mod_b {
 pub fn foo() {}
}
fn caller() {
 mod_a::foo();
 mod_b::foo();
}
";
    let call_config =
        languages::call_query_for_extension("rs").expect("Rust call config must exist");
    let lang_config = languages::config_for_extension("rs").expect("Rust lang config must exist");

    // Hand-rolled definition extraction: one `Definition` per query match
    // that has a `def` capture, named after the match's `name` capture.
    let mut defs = {
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&lang_config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        let mut cursor = tree_sitter::QueryCursor::new();
        let mut collected = Vec::new();
        let mut matches = cursor.matches(&lang_config.query, tree.root_node(), source.as_bytes());
        while let Some(found) = matches.next() {
            let mut symbol = String::new();
            let mut definition_node = None;
            for capture in found.captures {
                let label = &lang_config.query.capture_names()[capture.index as usize];
                let node = capture.node;
                if *label == "def" {
                    definition_node = Some(node);
                } else if *label == "name" {
                    let (from, to) = (node.start_byte(), node.end_byte());
                    symbol = source[from..to].to_string();
                }
            }
            // Matches without a `def` capture contribute nothing.
            let Some(node) = definition_node else {
                continue;
            };
            #[expect(clippy::cast_possible_truncation)]
            collected.push(Definition {
                name: symbol,
                kind: node.kind().to_string(),
                start_line: node.start_position().row as u32 + 1,
                end_line: node.end_position().row as u32 + 1,
                scope: String::new(),
                signature: None,
                start_byte: node.start_byte() as u32,
                end_byte: node.end_byte() as u32,
                calls: vec![],
                decorator: None,
                lsp_kind_hint: None,
            });
        }
        collected
    };

    extract_calls(source, &call_config, &mut defs);

    // Everything below inspects the calls attributed to `caller`.
    let caller_def = defs
        .iter()
        .find(|d| d.name == "caller")
        .expect("caller def");
    let calls = &caller_def.calls;
    let call_names: Vec<&str> = calls.iter().map(|c| c.name.as_str()).collect();
    let qualified_paths: Vec<Option<&str>> =
        calls.iter().map(|c| c.qualified_path.as_deref()).collect();

    // The bare name is just the trailing identifier.
    assert!(
        call_names.contains(&"foo"),
        "bare name 'foo' must appear for scoped calls; got: {call_names:?}"
    );
    // The qualified path keeps the module prefix, for both modules.
    assert!(
        qualified_paths.contains(&Some("mod_a::foo")),
        "qualified_path 'mod_a::foo' must appear; got: {qualified_paths:?}"
    );
    assert!(
        qualified_paths.contains(&Some("mod_b::foo")),
        "qualified_path 'mod_b::foo' must appear; got: {qualified_paths:?}"
    );
    // The full path must not leak into the bare name.
    assert!(
        !call_names.contains(&"mod_a::foo"),
        "full path 'mod_a::foo' must not appear in bare name; got: {call_names:?}"
    );
}
6731
/// RED test (R2.3 issue b+c): an unqualified call to `Read` while two files
/// each define a `Read` must not silently bind to the first candidate.
///
/// The fixture has `mod_a.rs` and `mod_b.rs` each defining a trait `Read`,
/// plus `caller.rs` whose only definition calls `Read` with no qualified
/// path, no receiver type and no imports. The call must stay unresolved.
#[test]
fn test_ambiguous_name_resolution_returns_all_or_none() {
    // Builds a file whose single definition is a trait named `Read`.
    let read_trait_file = |path: &str| FileNode {
        path: path.to_string(),
        defs: vec![Definition {
            name: "Read".to_string(),
            kind: "trait_item".to_string(),
            start_line: 1,
            end_line: 3,
            scope: String::new(),
            signature: None,
            start_byte: 0,
            end_byte: 50,
            calls: vec![],
            decorator: None,
            lsp_kind_hint: None,
        }],
        imports: vec![],
    };

    let unqualified_read_call = CallRef {
        name: "Read".to_string(),
        qualified_path: None,
        receiver_type: None,
        byte_offset: 10,
        resolved: None,
    };
    let caller_file = FileNode {
        path: "caller.rs".to_string(),
        defs: vec![Definition {
            name: "do_thing".to_string(),
            kind: "function_item".to_string(),
            start_line: 1,
            end_line: 5,
            scope: String::new(),
            signature: None,
            start_byte: 0,
            end_byte: 100,
            calls: vec![unqualified_read_call],
            decorator: None,
            lsp_kind_hint: None,
        }],
        imports: vec![],
    };

    let mut files = vec![
        read_trait_file("mod_a.rs"),
        read_trait_file("mod_b.rs"),
        caller_file,
    ];
    let def_index = build_def_index(&files);
    resolve_calls(&mut files, &def_index, &HashMap::new());

    // Two candidates, neither in the caller's file nor imported: the call
    // has to remain `None` rather than pick one arbitrarily.
    let resolved = files[2].defs[0].calls[0].resolved;
    assert_eq!(
        resolved, None,
        "ambiguous unqualified call with no import context must resolve to None, not silently pick first"
    );
}
6809
6810 // ── D1 / D2 tests ────────────────────────────────────────────────
6811
6812 /// Build a small test graph with N files and an optional JSON-extension file.
6813 fn build_test_graph(n_code: usize, include_json: bool) -> (RepoGraph, Vec<usize>) {
6814 let mut file_nodes: Vec<FileNode> = (0..n_code)
6815 .map(|i| FileNode {
6816 path: format!("src/file_{i}.rs"),
6817 defs: vec![
6818 Definition {
6819 name: format!("func_{i}"),
6820 kind: "function_item".to_string(),
6821 start_line: 1,
6822 end_line: 5,
6823 scope: String::new(),
6824 signature: Some(format!("fn func_{i}() -> i32")),
6825 start_byte: 0,
6826 end_byte: 100,
6827 calls: vec![],
6828 decorator: None,
6829 lsp_kind_hint: None,
6830 },
6831 Definition {
6832 name: format!("MyStruct{i}"),
6833 kind: "struct_item".to_string(),
6834 start_line: 7,
6835 end_line: 10,
6836 scope: String::new(),
6837 signature: None,
6838 start_byte: 110,
6839 end_byte: 200,
6840 calls: vec![],
6841 decorator: None,
6842 lsp_kind_hint: None,
6843 },
6844 ],
6845 imports: vec![],
6846 })
6847 .collect();
6848
6849 let json_idx = if include_json {
6850 let idx = file_nodes.len();
6851 file_nodes.push(FileNode {
6852 path: "data/config.json".to_string(),
6853 defs: vec![],
6854 imports: vec![],
6855 });
6856 vec![idx]
6857 } else {
6858 vec![]
6859 };
6860
6861 // Build a star graph: all code files point to file_0.
6862 let n = file_nodes.len();
6863 #[expect(clippy::cast_possible_truncation, reason = "test: n_code << u32::MAX")]
6864 let edges: Vec<(u32, u32, u32)> = (1..n_code).map(|i| (i as u32, 0, 1)).collect();
6865
6866 let base_ranks = pagerank(n, &edges, None);
6867 let (callers, callees) = build_neighbor_lists(n, &edges);
6868
6869 let graph = RepoGraph {
6870 files: file_nodes,
6871 edges,
6872 base_ranks,
6873 callers,
6874 callees,
6875 def_edges: vec![],
6876 def_ranks: vec![],
6877 def_callers: vec![],
6878 def_callees: vec![],
6879 def_offsets: vec![0],
6880 alpha: 0.5,
6881 };
6882
6883 (graph, json_idx)
6884 }
6885
/// D1: `render_json` produces a response whose `files` list is populated and
/// which serializes to JSON with a top-level `files` array.
///
/// Before D1 the repo map was markdown prose from `repo_map::render`, with
/// no `files` key anywhere in the output.
#[test]
fn get_repo_map_returns_json_with_files_array() {
    let (graph, _json_indices) = build_test_graph(5, false);
    let response = render_json(&graph, 50, None, false);

    let has_entries = !response.files.is_empty();
    assert!(
        has_entries,
        "files array should be non-empty for a non-empty graph"
    );

    // Round-trip through a JSON string to check the serialized shape.
    let encoded = serde_json::to_string(&response).expect("serialize");
    let parsed: serde_json::Value = serde_json::from_str(&encoded).expect("parse");
    let files_value = &parsed["files"];
    assert!(
        files_value.is_array(),
        "serialized response must have a `files` JSON array; got: {parsed}"
    );
}
6906
/// D1: every file entry exposes `lsp_location.file_path`, both on the typed
/// response (non-empty) and in the serialized JSON (a string).
///
/// Before D1 the output was prose and had no `lsp_location` at all.
#[test]
fn get_repo_map_each_file_has_lsp_location() {
    let (graph, _json_indices) = build_test_graph(5, false);
    let response = render_json(&graph, 50, None, false);

    // Typed view first.
    for entry in &response.files {
        let path = &entry.lsp_location.file_path;
        assert!(
            !path.is_empty(),
            "each file must have a non-empty lsp_location.file_path"
        );
    }

    // Then the same property on the serialized form.
    let encoded = serde_json::to_string(&response).expect("serialize");
    let parsed: serde_json::Value = serde_json::from_str(&encoded).expect("parse");
    let json_files = parsed["files"].as_array().expect("files array");
    for entry in json_files {
        assert!(
            entry["lsp_location"]["file_path"].is_string(),
            "each file entry must have lsp_location.file_path string; entry: {entry}"
        );
    }
}
6930
/// D1: each symbol carries a positive numeric `kind` and an `lsp_location`
/// with a file path, on the typed response as well as in serialized JSON.
///
/// Before D1 symbols were prose strings such as `"function_item func_0"`.
#[test]
fn get_repo_map_each_symbol_has_kind_and_lsp_location() {
    let (graph, _json_indices) = build_test_graph(3, false);
    let response = render_json(&graph, 50, None, false);

    // Typed view: walk every symbol of every file, in order.
    for sym in response.files.iter().flat_map(|file| &file.symbols) {
        assert!(
            sym.kind > 0,
            "symbol kind must be a positive LSP SymbolKind; got 0 for '{}'",
            sym.name
        );
        assert!(
            !sym.lsp_location.file_path.is_empty(),
            "symbol must have lsp_location.file_path"
        );
    }

    // Serialized view: `kind` is a JSON number, `file_path` a JSON string.
    let encoded = serde_json::to_string(&response).expect("serialize");
    let parsed: serde_json::Value = serde_json::from_str(&encoded).expect("parse");
    let json_symbols = parsed["files"]
        .as_array()
        .expect("files")
        .iter()
        .flat_map(|file_entry| file_entry["symbols"].as_array().expect("symbols"));
    for sym_entry in json_symbols {
        assert!(
            sym_entry["kind"].is_number(),
            "symbol `kind` must be a JSON number; sym: {sym_entry}"
        );
        assert!(
            sym_entry["lsp_location"]["file_path"].is_string(),
            "symbol must have lsp_location.file_path; sym: {sym_entry}"
        );
    }
}
6967
/// D1: every file's `calls` field serializes as an array, and each element
/// has the `RepoMapCall` shape: an `lsp_location` with a string `file_path`
/// plus a numeric `rank`.
///
/// In 4.0.1 call entries changed from bare `lsp_location` objects to
/// `RepoMapCall` objects that also carry the target file's `base_rank`.
#[test]
fn get_repo_map_calls_field_is_array_of_lsp_locations() {
    // 5-file star graph: files 1..4 each have an edge to file_0, so those
    // are the files that can have call entries.
    let (graph, _json_indices) = build_test_graph(5, false);
    let response = render_json(&graph, 50, None, false);

    let encoded = serde_json::to_string(&response).expect("serialize");
    let parsed: serde_json::Value = serde_json::from_str(&encoded).expect("parse");
    let json_calls = parsed["files"]
        .as_array()
        .expect("files")
        .iter()
        .flat_map(|file_entry| {
            file_entry["calls"]
                .as_array()
                .expect("calls must be an array")
        });
    for call in json_calls {
        // Each entry is a RepoMapCall: target location plus target rank.
        assert!(
            call["lsp_location"]["file_path"].is_string(),
            "each call entry must have lsp_location.file_path string; call: {call}"
        );
        assert!(
            call["rank"].is_number(),
            "each call entry must have a numeric rank; call: {call}"
        );
    }
}
6998
/// D2 / G3: a tight token budget caps how many files are returned, while
/// `total_files` still reports the full eligible count and `capped` is set.
///
/// The cap is exercised through `render_json_budgeted` directly, because the
/// `render_json` compat shim passes a generous budget. The budget used here
/// is 150 tokens; per the original author's note that corresponds to
/// 600 bytes, i.e. room for three files at a 200-byte per-file envelope floor.
#[test]
fn get_repo_map_returns_at_most_max_files_files() {
    let (graph, _json_indices) = build_test_graph(10, false);
    let response = render_json_budgeted(&graph, 150, None, false);

    // At most three of the ten files fit the budget.
    let returned = response.files.len();
    assert!(
        returned <= 3,
        "files.len() = {} must be <= 3 for a 600-byte budget",
        returned
    );
    // The pre-cap count is still reported in full.
    assert_eq!(
        response.total_files, 10,
        "total_files must reflect the full eligible count before budget cap"
    );
    assert!(
        response.capped,
        "capped must be true when total_files > files.len()"
    );
}
7028
/// D2: with `include_metadata = false` no returned file has a `.json`
/// extension, even though the graph contains `data/config.json`.
///
/// Before D2, JSON files with thousands of repeated keys dominated the
/// output (Issue #5 — JSON-key flooding).
#[test]
fn get_repo_map_excludes_meta_by_default() {
    let (graph, _json_indices) = build_test_graph(3, /*include_json=*/ true);
    let response = render_json(&graph, 50, None, false);

    for file in &response.files {
        let path = std::path::Path::new(&file.lsp_location.file_path);
        let is_json = path
            .extension()
            .is_some_and(|ext| ext.eq_ignore_ascii_case("json"));
        assert!(
            !is_json,
            "JSON (Meta) files must be excluded when include_metadata=false; found: {}",
            file.lsp_location.file_path
        );
    }
}
7048
/// D2: with `include_metadata = true` the `.json` file from the fixture
/// shows up among the returned files.
#[test]
fn get_repo_map_include_metadata_true_includes_json() {
    let (graph, _json_indices) = build_test_graph(3, /*include_json=*/ true);
    let response = render_json(&graph, 50, None, true);

    let mut has_json = false;
    for file in &response.files {
        let path = std::path::Path::new(&file.lsp_location.file_path);
        if path
            .extension()
            .is_some_and(|ext| ext.eq_ignore_ascii_case("json"))
        {
            has_json = true;
            break;
        }
    }
    assert!(
        has_json,
        "JSON file must be present when include_metadata=true"
    );
}
7066
/// J1/J2 MEASUREMENT: flask corpus focus_file=blueprints.py rank dispersion.
///
/// Mandatory measurement from the 4.0.5 Wave-2 Front-C briefing. Checks made
/// on the focused response:
/// - `len(files) >= 8` (not collapsed to just the focus)
/// - the focus file is present and within the top 3 results
/// - the first 5 non-focus files all have rank >= 10% of the top file's rank
/// - related files (app.py, scaffold.py, sansio) are looked for; their
///   absence is only logged, not asserted
///
/// # Panics
///
/// Panics (fails the test) when the corpus directory is missing, when the
/// focus file is not in the graph or not in the response, or when any of the
/// asserted criteria above does not hold.
#[test]
#[ignore = "runs on flask corpus at tests/corpus/code/flask; use --ignored --nocapture"]
#[expect(
    clippy::too_many_lines,
    reason = "end-to-end corpus measurement test; assertion sequence is sequential and cannot be meaningfully split"
)]
fn test_flask_focus_blueprints_rank_dispersion() {
    let corpus_root = Path::new(env!("CARGO_MANIFEST_DIR"))
        .parent()
        .unwrap()
        .parent()
        .unwrap()
        .join("tests/corpus/code/flask");

    assert!(
        corpus_root.exists(),
        "flask corpus not found at {}",
        corpus_root.display()
    );

    let graph = build_graph(&corpus_root).expect("build_graph on flask corpus");
    eprintln!("Flask corpus: {} files in graph", graph.files.len());

    // Find focus file
    let focus_path = "src/flask/blueprints.py";
    let focus_idx = graph.files.iter().position(|f| f.path == focus_path);
    eprintln!("Focus file '{focus_path}' -> idx: {focus_idx:?}");
    assert!(
        focus_idx.is_some(),
        "blueprints.py not found in graph; available files: {:?}",
        graph
            .files
            .iter()
            .map(|f| &f.path)
            .take(20)
            .collect::<Vec<_>>()
    );

    let response = render_json_budgeted(&graph, 4000, focus_idx, false);

    // Criterion 1: at least 8 files returned.
    eprintln!(
        "Focused response: {} files (total_files={})",
        response.files.len(),
        response.total_files
    );
    assert!(
        response.files.len() >= 8,
        "expected >= 8 files in focused response; got {} — I#16 winner-take-all collapse",
        response.files.len()
    );

    // Print top 10 for inspection.
    eprintln!("\nTop 10 focused files:");
    for (i, f) in response.files.iter().take(10).enumerate() {
        eprintln!(" [{i}] rank={:.6} {}", f.rank, f.lsp_location.file_path);
    }

    // Single definition of "this response entry is the focus file": the path
    // mentions blueprints.py but is neither a test file nor the sansio variant.
    let is_focus = |file_path: &str| {
        file_path.contains("blueprints.py")
            && !file_path.contains("test_")
            && !file_path.contains("sansio")
    };

    // Criterion 2: focus file must appear near the top (top-3) of focused
    // results. With PERSONALIZATION_ALPHA=0.15 and the flask corpus,
    // src/flask/app.py has higher structural rank than blueprints.py and
    // may legitimately rank #1 — the focus boosts blueprints.py relative
    // to its unfocused position, but doesn't guarantee it beats every
    // structurally central neighbor. Being in top-3 confirms the bias
    // is working (pre-fix blueprints.py was #1 at 0.703 but that was a
    // degenerate collapse; now #1 or #2 is healthy).
    let focus_hit = response
        .files
        .iter()
        .enumerate()
        .find(|(_, f)| is_focus(&f.lsp_location.file_path));
    // Fail with a real diagnostic when the focus file is absent, instead of
    // carrying a `usize::MAX` sentinel into `position + 1` arithmetic (which
    // overflows before the intended assertion message can be shown).
    let Some((focus_position, focus_file)) = focus_hit else {
        panic!(
            "blueprints.py is missing from the focused response (returned: {:?}); \
             soft personalization must rebias toward focus neighborhood — I#16",
            response
                .files
                .iter()
                .map(|f| &f.lsp_location.file_path)
                .collect::<Vec<_>>()
        );
    };
    let focus_file_rank = focus_file.rank;
    eprintln!(
        "\nblueprints.py position: #{} rank={:.6}",
        focus_position + 1,
        focus_file_rank
    );
    assert!(
        focus_position < 3,
        "blueprints.py must be in top-3 focused results (got position {}); \
         soft personalization must rebias toward focus neighborhood — I#16",
        focus_position + 1
    );

    // Criterion 3: next 5 non-focus files have rank >= 10% of the top
    // file's rank. This is the core dispersion check: no more Dirac-delta
    // collapse where one file is 0.703 and all others are 0.003.
    let top_rank = response.files[0].rank;
    let non_focus_min_5 = response
        .files
        .iter()
        .filter(|f| !is_focus(&f.lsp_location.file_path))
        .take(5)
        .map(|f| f.rank)
        .fold(f32::INFINITY, f32::min);
    let pct = non_focus_min_5 / top_rank * 100.0;
    eprintln!(
        "\nNext-5 (non-focus) min rank: {non_focus_min_5:.6} = {pct:.1}% of top rank {top_rank:.6}"
    );
    assert!(
        pct >= 10.0,
        "next-5 non-focus files min rank is {pct:.1}% of top rank (need ≥ 10%); \
         files are collapsing to near-zero floor — I#16"
    );

    // Criterion 4: neighborhood quality — related files present.
    let related_names = ["app.py", "scaffold.py", "sansio"];
    let found_related: Vec<&str> = related_names
        .iter()
        .copied()
        .filter(|name| {
            response
                .files
                .iter()
                .any(|f| f.lsp_location.file_path.contains(name))
        })
        .collect();
    eprintln!("\nNeighborhood quality: found related files: {found_related:?}");
    // At least one related file should appear (soft assertion — log if missing).
    if found_related.is_empty() {
        eprintln!(
            "WARNING: no expected related files (app.py, scaffold.py, sansio) found in neighborhood"
        );
    }
}
7216
/// Manual performance probe over the workspace root (two levels above this
/// crate's manifest directory): times `build_graph`, an unfocused `render`
/// and a `render` focused on the file with the highest base rank, then
/// prints the timings, graph sizes, the top five files by rank and both
/// rendered outputs to stderr. It makes no assertions beyond the graph
/// building successfully.
#[test]
#[ignore = "runs on full ripvec codebase; use --nocapture to see output"]
fn test_full_repo_map() {
    use std::time::Instant;

    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
    let root = manifest_dir.parent().unwrap().parent().unwrap();

    // Phase 1: build_graph (walk + parse + import resolve + PageRank)
    let build_timer = Instant::now();
    let graph = build_graph(root).expect("build_graph on ripvec root");
    let build_ms = build_timer.elapsed().as_secs_f64() * 1000.0;

    // Phase 2: render without a focus file
    let render_timer = Instant::now();
    let rendered = render(&graph, 2000, None);
    let render_ms = render_timer.elapsed().as_secs_f64() * 1000.0;

    // Phase 3: render focused on the highest-ranked file; the timer also
    // covers picking that file.
    let focus_timer = Instant::now();
    let focus_idx = graph
        .base_ranks
        .iter()
        .enumerate()
        .max_by(|(_, lhs), (_, rhs)| lhs.total_cmp(rhs))
        .map(|(idx, _)| idx);
    let focused = render(&graph, 2000, focus_idx);
    let focus_ms = focus_timer.elapsed().as_secs_f64() * 1000.0;

    let total_defs = graph.files.iter().map(|f| f.defs.len()).sum::<usize>();
    eprintln!("\n=== Repo Map Performance ===");
    eprintln!(
        "Files: {}, Edges: {}, Defs: {}",
        graph.files.len(),
        graph.edges.len(),
        total_defs
    );
    eprintln!("build_graph: {build_ms:.1}ms (walk + parse + resolve + PageRank)");
    eprintln!(
        "render(default): {render_ms:.3}ms ({} chars, ~{} tokens)",
        rendered.len(),
        rendered.len() / 4
    );
    eprintln!(
        "render(focused): {focus_ms:.3}ms ({} chars, ~{} tokens)",
        focused.len(),
        focused.len() / 4
    );

    // Highest base rank first.
    eprintln!("\nTop 5 by PageRank:");
    let mut ranked: Vec<(usize, f32)> = graph.base_ranks.iter().copied().enumerate().collect();
    ranked.sort_by(|lhs, rhs| rhs.1.total_cmp(&lhs.1));
    for &(file_idx, score) in ranked.iter().take(5) {
        eprintln!(" {:.4} {}", score, graph.files[file_idx].path);
    }

    let focus_label = focus_idx.map_or("none", |i| graph.files[i].path.as_str());
    eprintln!("\n=== Default Render ===\n{rendered}");
    eprintln!(
        "\n=== Focused Render (on {}) ===\n{focused}",
        focus_label
    );
}
7283
7284 // ── C1/C2 Tests (4.1.1): Python decorator-aware kind classification ──
7285
/// `test:chunker_stores_property_decorator_kind` — extracting definitions
/// from a Python function decorated with `@property` yields a
/// [`Definition`] named `foo` whose `decorator` is `Some("property")` and
/// whose `lsp_kind_hint` is `Some(7)`.
///
/// Behavior: trigger-fails-on-baseline-then-passes-post-fix.
/// Baseline: `Definition` had no `decorator` / `lsp_kind_hint` fields.
/// Post-fix: `extract_definitions` populates both from the AST.
#[test]
fn test_chunker_stores_property_decorator_kind() {
    let python = crate::languages::config_for_extension("py").expect("python lang config");
    let defs = extract_definitions("@property\ndef foo(self):\n return self._x\n", &python);

    let foo = defs
        .iter()
        .find(|def| def.name == "foo")
        .expect("expected def 'foo'");

    let decorator = foo.decorator.as_deref();
    assert_eq!(
        decorator,
        Some("property"),
        "decorator must be Some(\"property\") for @property def; got {:?}",
        foo.decorator
    );

    let kind_hint = foo.lsp_kind_hint;
    assert_eq!(
        kind_hint,
        Some(7),
        "lsp_kind_hint must be Some(7) (Property) for @property def; got {:?}",
        foo.lsp_kind_hint
    );
}
7316
/// `test:chunker_stores_classmethod_decorator_kind` — extracting definitions
/// from a Python function decorated with `@classmethod` yields a
/// [`Definition`] named `from_dict` whose `decorator` is
/// `Some("classmethod")` and whose `lsp_kind_hint` is `Some(12)`.
#[test]
fn test_chunker_stores_classmethod_decorator_kind() {
    let python = crate::languages::config_for_extension("py").expect("python lang config");
    let defs = extract_definitions("@classmethod\ndef from_dict(cls, d):\n pass\n", &python);

    let from_dict = defs
        .iter()
        .find(|def| def.name == "from_dict")
        .expect("expected def 'from_dict'");

    let decorator = from_dict.decorator.as_deref();
    assert_eq!(
        decorator,
        Some("classmethod"),
        "decorator must be Some(\"classmethod\"); got {:?}",
        from_dict.decorator
    );

    let kind_hint = from_dict.lsp_kind_hint;
    assert_eq!(
        kind_hint,
        Some(12),
        "lsp_kind_hint must be Some(12) (Function) for @classmethod def; got {:?}",
        from_dict.lsp_kind_hint
    );
}
7343
/// `test:chunker_stores_arbitrary_decorator_name` — an attribute-access
/// decorator (`@functools.lru_cache`) is stored on the [`Definition`] as
/// its dotted-name text, `Some("functools.lru_cache")`, with
/// `lsp_kind_hint = Some(12)`.
#[test]
fn test_chunker_stores_arbitrary_decorator_name() {
    let python = crate::languages::config_for_extension("py").expect("python lang config");
    let defs = extract_definitions(
        "@functools.lru_cache\ndef expensive(n):\n pass\n",
        &python,
    );

    let expensive = defs
        .iter()
        .find(|def| def.name == "expensive")
        .expect("expected def 'expensive'");

    // The decorator keeps its full dotted spelling.
    let decorator = expensive.decorator.as_deref();
    assert_eq!(
        decorator,
        Some("functools.lru_cache"),
        "decorator must be Some(\"functools.lru_cache\"); got {:?}",
        expensive.decorator
    );

    let kind_hint = expensive.lsp_kind_hint;
    assert_eq!(
        kind_hint,
        Some(12),
        "lsp_kind_hint must be Some(12) (Function) for @functools.lru_cache def; got {:?}",
        expensive.lsp_kind_hint
    );
}
7372
/// `test:repo_map_projection_uses_stored_kind_for_python_decorator` — the
/// symbol emitted by `render_json_budgeted` takes its `kind` from the
/// stored `lsp_kind_hint` of the [`Definition`].
///
/// The fixture is a one-file graph with a single definition whose node kind
/// is `"decorated_definition"` and whose `lsp_kind_hint` is `Some(12)`.
/// Per the original author's note, the node kind alone would map to 7
/// (Property) through `lsp_symbol_kind_for_node_kind`; the rendered
/// [`RepoMapSymbol`] must report 12 instead.
#[test]
fn test_repo_map_projection_uses_stored_kind_for_python_decorator() {
    // One @classmethod-decorated function; the hint (12) is what was stored
    // at parse time.
    let decorated = Definition {
        name: "from_dict".to_string(),
        kind: "decorated_definition".to_string(),
        start_line: 1,
        end_line: 3,
        scope: String::new(),
        signature: None,
        start_byte: 0,
        end_byte: 60,
        calls: vec![],
        decorator: Some("classmethod".to_string()),
        lsp_kind_hint: Some(12),
    };
    let module = FileNode {
        path: "module.py".to_string(),
        defs: vec![decorated],
        imports: vec![],
    };

    let graph = build_graph_from_files_pub(vec![module]);
    let result = render_json_budgeted(&graph, 4000, None, false);

    // Find the file, then the symbol within it.
    let rendered_file = result
        .files
        .iter()
        .find(|f| f.lsp_location.file_path.contains("module.py"))
        .expect("module.py must appear in render output");
    let rendered_symbol = rendered_file
        .symbols
        .iter()
        .find(|s| s.name == "from_dict")
        .expect("from_dict must appear as a symbol");

    assert_eq!(
        rendered_symbol.kind, 12,
        "C2: lsp_kind_hint=12 must override the AST-less kind (7 for decorated_definition); got {}",
        rendered_symbol.kind
    );
}
7422}