ripvec_core/repo_map.rs
1//! `PageRank`-weighted structural overview of a codebase.
2//!
3//! Builds a dependency graph from tree-sitter definition and import extraction,
4//! ranks files by importance using `PageRank` (standard or topic-sensitive), and
5//! renders a budget-constrained overview with tiered detail levels.
6
7use std::collections::{HashMap, HashSet};
8use std::fmt::Write as _;
9use std::path::{Path, PathBuf};
10
11use rayon::prelude::*;
12use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
13use streaming_iterator::StreamingIterator;
14use tree_sitter::{Parser, Query, QueryCursor};
15
16use serde::{Deserialize, Serialize};
17
18use crate::chunk::ContentKind;
19use crate::languages;
20use crate::walk;
21
22/// Serialize a `ContentKind` to a lowercase string tag for JSON output.
23fn content_kind_tag(ck: ContentKind) -> &'static str {
24 match ck {
25 ContentKind::Code => "code",
26 ContentKind::Docs => "docs",
27 ContentKind::Meta => "meta",
28 }
29}
30
31// ── Data Structures ──────────────────────────────────────────────────
32
33/// Persisted dependency graph with `PageRank` scores.
34#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
35pub struct RepoGraph {
36 /// Files in the repository with definitions, imports, and calls.
37 pub files: Vec<FileNode>,
38 /// File-level edges (derived from def-level call edges).
39 pub edges: Vec<(u32, u32, u32)>,
40 /// File-level `PageRank` scores (aggregated from def-level).
41 pub base_ranks: Vec<f32>,
42 /// File-level callers (indices into `files`).
43 pub callers: Vec<Vec<u32>>,
44 /// File-level callees (indices into `files`).
45 pub callees: Vec<Vec<u32>>,
46 /// Definition-level call edges: `(caller_def, callee_def, weight)`.
47 pub def_edges: Vec<(DefId, DefId, u32)>,
48 /// Definition-level `PageRank` scores (flattened: `offsets[file_idx] + def_idx`).
49 pub def_ranks: Vec<f32>,
50 /// Definition-level callers (flattened, parallel to `def_ranks`).
51 pub def_callers: Vec<Vec<DefId>>,
52 /// Definition-level callees (flattened, parallel to `def_ranks`).
53 pub def_callees: Vec<Vec<DefId>>,
54 /// Prefix-sum offsets for flattening `DefId` to linear index.
55 pub def_offsets: Vec<usize>,
56 /// Auto-tuned alpha for search boost.
57 pub alpha: f32,
58}
59
60/// A file in the repository with its definitions and imports.
61#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
62pub struct FileNode {
63 /// Relative path from the repository root.
64 pub path: String,
65 /// Definitions (functions, structs, classes, etc.) extracted from this file.
66 pub defs: Vec<Definition>,
67 /// Import references extracted from this file.
68 pub imports: Vec<ImportRef>,
69}
70
71/// A definition extracted from a source file.
72#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
73pub struct Definition {
74 /// Name of the definition (e.g., function name, class name).
75 pub name: String,
76 /// Kind of syntax node (e.g., `function_item`, `class_definition`).
77 pub kind: String,
78 /// 1-based start line number.
79 pub start_line: u32,
80 /// 1-based end line number.
81 pub end_line: u32,
82 /// Scope chain (e.g., `"impl_item Foo > fn bar"`).
83 pub scope: String,
84 /// Function/method signature, if available.
85 pub signature: Option<String>,
86 /// Byte offset of this definition's start in the source file.
87 pub start_byte: u32,
88 /// Byte offset of this definition's end in the source file.
89 pub end_byte: u32,
90 /// Call sites within this definition's body.
91 pub calls: Vec<CallRef>,
92 /// The first decorator name for Python `decorated_definition` nodes (e.g.,
93 /// `"property"`, `"classmethod"`, `"staticmethod"`, `"cached_property"`).
94 ///
95 /// `None` for all non-Python definitions and for bare (undecorated) Python
96 /// functions and classes. Populated by `extract_definitions` at AST-parse
97 /// time with full tree-sitter access (C1, 4.1.1).
98 pub decorator: Option<String>,
99 /// Decorator-aware LSP SymbolKind integer (e.g., 7=Property, 12=Function).
100 ///
101 /// Computed at parse time when the AST is available so projection sites
102 /// (`render_json_budgeted`) do not need to re-parse. For Python `@property`
103 /// or `@cached_property` → 7. For `@classmethod`, `@staticmethod`, or any
104 /// other decorator → 12. `None` for all non-decorated definitions; callers
105 /// fall back to `lsp_symbol_kind_for_node_kind(&self.kind)` when `None`
106 /// (C1/C2, 4.1.1).
107 pub lsp_kind_hint: Option<u32>,
108}
109
110/// An import reference extracted from a source file.
111#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
112pub struct ImportRef {
113 /// Raw import path as written in source (e.g., `crate::foo::bar`).
114 pub raw_path: String,
115 /// Resolved file index in [`RepoGraph::files`], if resolution succeeded.
116 pub resolved_idx: Option<u32>,
117}
118
119/// Unique identifier for a definition: (file index, definition index within file).
120pub type DefId = (u32, u16);
121
122/// A call site extracted from a definition body.
123#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
124pub struct CallRef {
125 /// Callee function/method name (bare, without qualifier).
126 ///
127 /// For scoped calls like `mod_a::foo()`, this is `"foo"`.
128 /// For bare calls like `foo()`, this is `"foo"`.
129 pub name: String,
130 /// Full qualified path for scoped calls, e.g. `Some("mod_a::foo")`.
131 ///
132 /// `None` for bare (unqualified) calls. When `Some`, `resolve_calls`
133 /// uses this for qualifier-based module disambiguation before falling
134 /// back to the bare `name`.
135 pub qualified_path: Option<String>,
136 /// Receiver type for method calls, inferred from local context.
137 ///
138 /// Set to `Some("Foo")` when:
139 /// - The call is `self.method()` inside `impl Foo { … }`.
140 /// - The call is `x.method()` where `x` has an explicit type annotation `x: Foo`.
141 /// - The call is `x.method()` after `let x = Foo::new()`.
142 ///
143 /// `None` for free function calls, or when the receiver type cannot be
144 /// inferred from local context alone. When `Some`, `resolve_calls` prefers
145 /// defs whose enclosing impl scope matches the receiver type.
146 pub receiver_type: Option<String>,
147 /// Byte offset of the call in the source file (for scoping to definitions).
148 pub byte_offset: u32,
149 /// Resolved target definition, if resolution succeeded.
150 pub resolved: Option<DefId>,
151}
152
153// ── JSON output types ────────────────────────────────────────────────
154
155/// LSP-shaped location pointing at a file or symbol within a file.
156///
157/// Lines and characters are 0-based, matching the Language Server Protocol
158/// convention so callers can pass this directly to LSP tools without any
159/// conversion.
160#[derive(Debug, Clone, Serialize)]
161pub struct RepoMapLspLocation {
162 /// Relative path from the repository root (prefixed with `./`).
163 pub file_path: String,
164 /// 0-based start line.
165 pub start_line: usize,
166 /// 0-based start character (0 for file-level locations).
167 pub start_character: usize,
168 /// 0-based end line (equals `start_line` for file-level locations).
169 pub end_line: usize,
170 /// 0-based end character (0 for file-level locations).
171 pub end_character: usize,
172}
173
174/// A top-level symbol extracted from a file in the repository map.
175///
176/// Analogous to an LSP `DocumentSymbol` but limited to the fields available
177/// from tree-sitter definition extraction. The `rank` field carries the
178/// definition-level `PageRank` score from [`RepoGraph::def_ranks`], enabling
179/// callers to prioritise symbols by structural importance.
180#[derive(Debug, Clone, Serialize)]
181pub struct RepoMapSymbol {
182 /// Symbol name (function name, struct name, etc.).
183 pub name: String,
184 /// LSP `SymbolKind` as a decimal — use the same values as
185 /// `lsp_workspace_symbols` and `lsp_document_symbols`.
186 pub kind: u32,
187 /// Location pointing at the symbol's definition line (0-based).
188 pub lsp_location: RepoMapLspLocation,
189 /// Definition-level `PageRank` score from [`RepoGraph::def_ranks`].
190 ///
191 /// Higher values indicate definitions that are called by many other
192 /// definitions. Used by the token-budget allocator to decide which
193 /// symbols to include when the per-file budget is constrained.
194 pub rank: f32,
195}
196
197/// An outgoing call-edge from a file to another file.
198///
199/// Carries both the target file's `lsp_location` and its `base_rank`
200/// (file-level `PageRank` score) so callers can decide how important
201/// each dependency is without a separate lookup.
202#[derive(Debug, Clone, Serialize)]
203pub struct RepoMapCall {
204 /// Location pointing at the target file (line 0, character 0).
205 pub lsp_location: RepoMapLspLocation,
206 /// File-level `PageRank` score of the target file.
207 pub rank: f32,
208}
209
210/// One file entry in the JSON repo map.
211///
212/// Carries the file's `PageRank` score, content kind, outgoing call-edges to
213/// other files, and the file's top-level symbol definitions — all with
214/// `lsp_location` so the caller can chain directly into LSP tools without
215/// any destructuring.
216#[derive(Debug, Clone, Serialize)]
217pub struct RepoMapFile {
218 /// Location pointing at the file itself (line 0, character 0).
219 ///
220 /// Pass `lsp_location.file_path` directly into `lsp_document_symbols` or
221 /// any other file-scoped tool.
222 pub lsp_location: RepoMapLspLocation,
223 /// `PageRank` score in [0, 1] (higher = more structurally central).
224 pub rank: f32,
225 /// Content classification: `"code"`, `"docs"`, or `"meta"`.
226 ///
227 /// Serialized as a lowercase string tag so JSON consumers can branch
228 /// without numeric magic values. Mirrors the `ContentKind` enum in
229 /// `ripvec-core::chunk`.
230 pub content_kind: &'static str,
231 /// Outgoing call-edges sorted by target file `PageRank` descending.
232 pub calls: Vec<RepoMapCall>,
233 /// Top-level definitions extracted from this file by tree-sitter,
234 /// sorted by definition-level `PageRank` descending and pruned to
235 /// the per-file token-budget allocation.
236 pub symbols: Vec<RepoMapSymbol>,
237 /// Number of symbols that were omitted due to budget exhaustion or
238 /// logarithmic attenuation cutoff. `truncated_symbols + symbols.len()`
239 /// equals the total definition count for the file.
240 pub truncated_symbols: usize,
241 /// Number of call-edges that were omitted due to the `MAX_FILE_CALLS`
242 /// render cap or byte-budget exhaustion. `truncated_calls + calls.len()`
243 /// equals the total callee count for the file (I#68, 4.1.4).
244 pub truncated_calls: usize,
245}
246
247/// JSON-mode response envelope for `get_repo_map` (4.0.1 shape).
248///
249/// Replaces the `max_files`-capped shape from 4.0.0. The caller supplies a
250/// `token_budget`; files are allocated bytes proportional to their `PageRank`
251/// (40% cap per file, 200-byte envelope floor). Symbols are filled in
252/// def-rank order with a logarithmic attenuation cutoff. Leftover bytes
253/// cascade to subsequent files.
254///
255/// The `estimated_bytes`, `budget_bytes`, and `budget_exhausted` fields give
256/// callers real-time feedback on how tightly the budget was consumed.
257#[derive(Debug, Clone, Serialize)]
258pub struct GetRepoMapResponse {
259 /// Files sorted by `PageRank` descending, pruned to the token budget.
260 pub files: Vec<RepoMapFile>,
261 /// Total number of eligible files in the graph (pre-allocation).
262 ///
263 /// If `total_files > files.len()`, the budget ran out before all files
264 /// could be included. Read `budget_exhausted` directly for the boolean.
265 pub total_files: usize,
266 /// Actual serialised-JSON byte count for all returned content.
267 pub estimated_bytes: usize,
268 /// Budget ceiling in bytes that was used for allocation
269 /// (`token_budget * 4`).
270 pub budget_bytes: usize,
271 /// `true` when `total_files > files.len()` (budget was exhausted before
272 /// all eligible files were included).
273 pub budget_exhausted: bool,
274 /// Retained for backward compatibility with 4.0.0 callers that checked
275 /// `capped`. Equivalent to `budget_exhausted`.
276 pub capped: bool,
277}
278
279// ── Constants ────────────────────────────────────────────────────────
280
281/// `PageRank` damping factor.
282const DAMPING: f32 = 0.85;
283
284/// `PageRank` convergence threshold.
285const EPSILON: f32 = 1e-6;
286
287/// Maximum `PageRank` iterations.
288const MAX_ITERATIONS: usize = 100;
289
290/// Maximum callers/callees stored per file (display-oriented neighbor lists).
291///
292/// Raised from 5 → 25 in 4.1.3 (I#60): hub functions in real Rust/Python/Go
293/// corpora commonly have 10-25 outgoing call edges; the old cap of 5 caused
294/// every large-scale corpus file to report `truncated_calls ≥ 2` and left
295/// agents reading `get_repo_map.files[i].calls[]` with a sparse skeleton.
296///
297/// The serialisation-size cost is bounded by the existing `token_budget`
298/// allocator, which gates each file's call list against its byte allocation.
299/// BFS-internal reachability (`compute_dead_code`) uses the untruncated
300/// `def_edges` CSR directly (I#57/I#61) and is NOT affected by this constant.
301const MAX_NEIGHBORS: usize = 25;
302
303/// Maximum number of outgoing call entries rendered per file in the JSON response.
304///
305/// Applied at render time in [`render_json_budgeted`] as a hard cap on
306/// `calls[]` length (I#68, 4.1.4). Symmetric with the graph-build cap
307/// [`MAX_NEIGHBORS`] so agents always see up to 25 callees — matching the
308/// def-level cap introduced in I#60.
309///
310/// Replaces the logarithmic attenuation cutoff that was previously applied to
311/// file-level callees. Attenuation is appropriate for *symbol* lists (where
312/// rank distributions are informative) but pathological for *call-edge* lists:
313/// in real corpora callee base-ranks follow a geometric distribution, causing
314/// the attenuation to fire at pos=1 and collapse `calls[]` to a single entry.
315/// The byte-budget check is retained; this constant adds a count ceiling.
316const MAX_FILE_CALLS: usize = 25;
317
318/// Approximate characters per token for budget estimation.
319const CHARS_PER_TOKEN: usize = 4;
320
321/// Concentration mass placed on the focus node in topic-sensitive `PageRank`.
322///
323/// Following Haveliwala 2002 ("Topic-Sensitive PageRank"), the personalization
324/// vector places a bias `α` on the focus node and distributes the remaining
325/// `1 - α` uniformly over all other nodes. This preserves rank dispersion
326/// across the corpus — the user sees a *neighborhood* of related files
327/// rebiased toward the focus, not a Dirac delta on the focus node with
328/// every other file collapsed to an equally negligible uniform floor.
329///
330/// Value 0.35 means:
331/// - focus node teleportation probability = 0.35
332/// - each of the (n - 1) other nodes = 0.65 / (n - 1)
333///
334/// Calibration history:
335/// - Pre-4.0.5: α = 0.70 → winner-take-all collapse (flask focus = 0.703,
336/// all others ≈ 0.003); fixed under I#16.
337/// - 4.0.5 → 4.1.11: α = 0.15 → preserved dispersion but underbiased on
338/// real corpora: flask focus blueprints.py landed at position #5-#7
339/// instead of top-3 because structural hubs (helpers.py, app.py)
340/// dominated.
341/// - 4.1.12+: α = 0.35 → focus reliably surfaces in top-3 on flask
342/// (rank ~0.10 vs hub rank ~0.10), small-graph dispersion tests still
343/// pass (n=10 star dispersion ratio remains under the 40× ceiling),
344/// J2 file-count parity relaxed from 80% to 70% as the unavoidable
345/// trade-off for stronger focus bias.
346const PERSONALIZATION_ALPHA: f32 = 0.35;
347
348// ── Import Queries ───────────────────────────────────────────────────
349
350/// Compile a tree-sitter import query for the given extension.
351///
352/// Returns `None` for unsupported extensions.
353fn import_query_for_extension(ext: &str) -> Option<(tree_sitter::Language, Query)> {
354 let (lang, query_str): (tree_sitter::Language, &str) = match ext {
355 "rs" => (
356 tree_sitter_rust::LANGUAGE.into(),
357 "(use_declaration) @import",
358 ),
359 "py" | "pyi" => (
360 tree_sitter_python::LANGUAGE.into(),
361 concat!(
362 "(import_statement) @import\n",
363 "(import_from_statement) @import",
364 ),
365 ),
366 "js" | "jsx" => (
367 tree_sitter_javascript::LANGUAGE.into(),
368 "(import_statement source: (string) @import_path) @import",
369 ),
370 "ts" => (
371 tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
372 "(import_statement source: (string) @import_path) @import",
373 ),
374 "tsx" => (
375 tree_sitter_typescript::LANGUAGE_TSX.into(),
376 "(import_statement source: (string) @import_path) @import",
377 ),
378 "go" => (
379 tree_sitter_go::LANGUAGE.into(),
380 "(import_spec path: (interpreted_string_literal) @import_path) @import",
381 ),
382 // Ruby: require statements.
383 "rb" => (
384 tree_sitter_ruby::LANGUAGE.into(),
385 "(call method: (identifier) @_method arguments: (argument_list (string (string_content) @import_path)) (#eq? @_method \"require\")) @import",
386 ),
387 _ => return None,
388 };
389 let query = match Query::new(&lang, query_str) {
390 Ok(q) => q,
391 Err(e) => {
392 tracing::warn!(ext, %e, "import query compilation failed — language may be ABI-incompatible");
393 return None;
394 }
395 };
396 Some((lang, query))
397}
398
399/// Extract import paths from source using tree-sitter.
400fn extract_imports(
401 source: &str,
402 lang: &tree_sitter::Language,
403 import_query: &Query,
404) -> Vec<String> {
405 let mut parser = Parser::new();
406 if parser.set_language(lang).is_err() {
407 return vec![];
408 }
409 let Some(tree) = parser.parse(source, None) else {
410 return vec![];
411 };
412
413 let mut cursor = QueryCursor::new();
414 let mut imports = Vec::new();
415 let mut matches = cursor.matches(import_query, tree.root_node(), source.as_bytes());
416
417 while let Some(m) = matches.next() {
418 // Prefer @import_path capture (JS/TS/Go), fall back to full @import text
419 let mut import_path_text = None;
420 let mut import_text = None;
421
422 for cap in m.captures {
423 let cap_name = &import_query.capture_names()[cap.index as usize];
424 let text = &source[cap.node.start_byte()..cap.node.end_byte()];
425 if *cap_name == "import_path" {
426 import_path_text = Some(text.trim_matches(|c| c == '"' || c == '\''));
427 } else if *cap_name == "import" {
428 import_text = Some(text);
429 }
430 }
431
432 if let Some(path) = import_path_text {
433 imports.push(path.to_string());
434 } else if let Some(text) = import_text {
435 imports.push(text.to_string());
436 }
437 }
438
439 imports
440}
441
442// ── Import Resolution ────────────────────────────────────────────────
443
444/// Resolve a Rust `use` path to a file index in the file map.
445///
446/// Handles `crate::`, `self::`, and `super::` prefixes. External crate
447/// imports are dropped (returns `None`).
448fn resolve_rust_import(
449 raw: &str,
450 file_path: &Path,
451 root: &Path,
452 file_index: &HashMap<PathBuf, usize>,
453) -> Option<usize> {
454 // Extract the module path from `use crate::foo::bar;` or `use crate::foo::bar::Baz;`
455 let trimmed = raw
456 .trim()
457 .trim_start_matches("use ")
458 .trim_end_matches(';')
459 .trim();
460
461 let segments: Vec<&str> = trimmed.split("::").collect();
462 if segments.is_empty() {
463 return None;
464 }
465
466 // Determine the base directory and skip prefix segments
467 let (base, skip) = match segments[0] {
468 "crate" => {
469 // Find the nearest Cargo.toml ancestor to determine the crate root.
470 // In a workspace, `crate::foo` resolves relative to the crate's src/,
471 // not the workspace root.
472 let mut dir = file_path.parent();
473 let crate_root = loop {
474 match dir {
475 Some(d) if d.join("Cargo.toml").exists() => break d.join("src"),
476 Some(d) => dir = d.parent(),
477 None => break root.join("src"), // fallback
478 }
479 };
480 (crate_root, 1)
481 }
482 "self" => {
483 let dir = file_path.parent()?;
484 (dir.to_path_buf(), 1)
485 }
486 "super" => {
487 let dir = file_path.parent()?.parent()?;
488 (dir.to_path_buf(), 1)
489 }
490 // External crate — drop
491 _ => return None,
492 };
493
494 // Build candidate paths from the remaining segments.
495 // Try progressively shorter prefixes since the last segments
496 // may be items (struct, fn) rather than modules.
497 let path_segments = &segments[skip..];
498 for end in (1..=path_segments.len()).rev() {
499 let mut candidate = base.clone();
500 for seg in &path_segments[..end] {
501 // Strip glob patterns like `{Foo, Bar}`
502 let clean = seg.split('{').next().unwrap_or(seg).trim();
503 if !clean.is_empty() {
504 candidate.push(clean);
505 }
506 }
507
508 // Try file.rs
509 let as_file = candidate.with_extension("rs");
510 if let Some(&idx) = file_index.get(&as_file) {
511 return Some(idx);
512 }
513
514 // Try dir/mod.rs
515 let as_mod = candidate.join("mod.rs");
516 if let Some(&idx) = file_index.get(&as_mod) {
517 return Some(idx);
518 }
519 }
520
521 None
522}
523
524/// Resolve an import path to a file index based on file extension.
525fn resolve_import(
526 raw: &str,
527 ext: &str,
528 file_path: &Path,
529 root: &Path,
530 file_index: &HashMap<PathBuf, usize>,
531) -> Option<usize> {
532 match ext {
533 "rs" => resolve_rust_import(raw, file_path, root, file_index),
534 "py" | "pyi" => resolve_python_import(raw, root, file_index),
535 "js" | "jsx" | "ts" | "tsx" => resolve_js_import(raw, file_path, file_index),
536 // Go imports use full package paths — skip local resolution
537 _ => None,
538 }
539}
540
541/// Resolve a Python import to a file index.
542///
543/// Handles `import foo.bar` and `from foo.bar import baz` patterns.
544fn resolve_python_import(
545 raw: &str,
546 root: &Path,
547 file_index: &HashMap<PathBuf, usize>,
548) -> Option<usize> {
549 let module_path = if let Some(rest) = raw.strip_prefix("from ") {
550 rest.split_whitespace().next()?
551 } else if let Some(rest) = raw.strip_prefix("import ") {
552 rest.split_whitespace().next()?
553 } else {
554 return None;
555 };
556
557 let rel_path: PathBuf = module_path.split('.').collect();
558 for ext in ["py", "pyi"] {
559 let as_file = root.join(&rel_path).with_extension(ext);
560 if let Some(&idx) = file_index.get(&as_file) {
561 return Some(idx);
562 }
563 }
564
565 for init_name in ["__init__.py", "__init__.pyi"] {
566 let as_init = root.join(&rel_path).join(init_name);
567 if let Some(&idx) = file_index.get(&as_init) {
568 return Some(idx);
569 }
570 }
571
572 None
573}
574
575/// Resolve a JS/TS import to a file index.
576///
577/// Handles relative paths like `./foo` or `../bar`.
578fn resolve_js_import(
579 raw: &str,
580 file_path: &Path,
581 file_index: &HashMap<PathBuf, usize>,
582) -> Option<usize> {
583 if !raw.starts_with('.') {
584 return None;
585 }
586
587 let dir = file_path.parent()?;
588 let candidate = dir.join(raw);
589
590 for ext in &["js", "jsx", "ts", "tsx"] {
591 let with_ext = candidate.with_extension(ext);
592 if let Some(&idx) = file_index.get(&with_ext) {
593 return Some(idx);
594 }
595 }
596
597 for ext in &["js", "jsx", "ts", "tsx"] {
598 let index_file = candidate.join("index").with_extension(ext);
599 if let Some(&idx) = file_index.get(&index_file) {
600 return Some(idx);
601 }
602 }
603
604 None
605}
606
607// ── Extraction ───────────────────────────────────────────────────────
608
609/// Extract the name of the first decorator from a `decorated_definition` node.
610///
611/// Mirrors the private `languages::first_decorator_ident` but inlined here
612/// because that function is private to the `languages` module.
613///
614/// For simple `@name` decorators (e.g., `@property`, `@classmethod`), returns
615/// `Some("property")` or `Some("classmethod")`.
616///
617/// For attribute-access decorators (e.g., `@functools.lru_cache`), returns
618/// `Some("functools.lru_cache")` — the full dotted-name text.
619///
620/// For call-expression decorators (e.g., `@app.route("/")`), returns `None`.
621fn extract_first_decorator_name(node: &tree_sitter::Node<'_>, source: &[u8]) -> Option<String> {
622 let mut cursor = node.walk();
623 for child in node.children(&mut cursor) {
624 if child.kind() == "decorator" {
625 let mut inner = child.walk();
626 for inner_child in child.children(&mut inner) {
627 match inner_child.kind() {
628 // Simple name (@property) or attribute access (@functools.lru_cache) —
629 // return the full text so callers can store it for display.
630 "identifier" | "attribute" => {
631 return std::str::from_utf8(
632 &source[inner_child.start_byte()..inner_child.end_byte()],
633 )
634 .ok()
635 .map(str::to_owned);
636 }
637 // Call expression — ambiguous; treat as None.
638 "call" => return None,
639 _ => {}
640 }
641 }
642 return None;
643 }
644 }
645 None
646}
647
648/// Determine whether a Python `(assignment)` or JS/TS `(variable_declarator)`
649/// def captured by the def-query is spuriously nested inside a function body.
650///
651/// Cycle 10 W1 Front A — the Python def-query at `languages.rs:637` and the
652/// JS/TS variants at `:647`, `:657`, `:669` capture every `x = foo()` or
653/// `const x = foo()` site, even those inside function bodies, as defs. The
654/// `extract_calls` smallest-enclosing rule (`repo_map.rs:827-832`) then steals
655/// every call out of the surrounding function into the nested assignment def.
656/// Result: BFS terminates early because function defs have no outgoing edges.
657///
658/// Fix: walk up the AST from the def node. If we hit a function-body context
659/// (Python `function_definition`, JS/TS `function_declaration`,
660/// `function_expression`, `arrow_function`, `method_definition`,
661/// `generator_function_declaration`, `generator_function`) before reaching the
662/// module root, the def is spurious and must be dropped.
663///
664/// Module-level constants (`MAX = 4096`) and class-attribute assignments
665/// (`class Foo: bar = make_bar()`) — both of which are legitimate defs — are
666/// preserved because their ancestor chain contains only `module` /
667/// `class_definition` / `class_body` / `block` / `program` nodes, never a
668/// function-body container.
669fn is_spurious_nested_binding_def(kind: &str, node: tree_sitter::Node<'_>) -> bool {
670 // Only the assignment / variable_declarator captures are at risk.
671 // Other def kinds (function_definition, class_definition, method_definition,
672 // function_declaration, class_declaration, type_alias_declaration, etc.)
673 // are always legitimate at any nesting depth (e.g. nested functions,
674 // methods inside classes).
675 if !matches!(kind, "assignment" | "variable_declarator") {
676 return false;
677 }
678 let mut cur = node.parent();
679 while let Some(parent) = cur {
680 match parent.kind() {
681 // Function-body containers: any def captured beneath one of these
682 // is a local-variable binding, not a module/class-level def.
683 "function_definition" // Python
684 | "function_declaration" // JS / TS
685 | "function_expression" // JS / TS
686 | "arrow_function" // JS / TS
687 | "method_definition" // JS / TS
688 | "generator_function_declaration"
689 | "generator_function" => return true,
690 // Top-level containers — we reached the module without finding a
691 // function ancestor. The def is legitimate.
692 "module" | "program" => return false,
693 _ => {}
694 }
695 cur = parent.parent();
696 }
697 false
698}
699
700/// Extract definitions from a source file using tree-sitter.
701fn extract_definitions(source: &str, config: &languages::LangConfig) -> Vec<Definition> {
702 let mut parser = Parser::new();
703 if parser.set_language(&config.language).is_err() {
704 return vec![];
705 }
706 let Some(tree) = parser.parse(source, None) else {
707 return vec![];
708 };
709
710 let mut cursor = QueryCursor::new();
711 let mut defs = Vec::new();
712 let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
713
714 while let Some(m) = matches.next() {
715 let mut name = String::new();
716 let mut def_node = None;
717
718 for cap in m.captures {
719 let cap_name = &config.query.capture_names()[cap.index as usize];
720 if *cap_name == "name" {
721 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
722 } else if *cap_name == "def" {
723 def_node = Some(cap.node);
724 }
725 }
726
727 if let Some(node) = def_node {
728 // Cycle 10 W1 Front A: drop spurious nested assignment / variable_declarator
729 // defs that would steal calls from the enclosing function.
730 if is_spurious_nested_binding_def(node.kind(), node) {
731 continue;
732 }
733 let scope = crate::chunk::build_scope_chain(node, source);
734 let signature = crate::chunk::extract_signature(node, source);
735 #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
736 let start_line = node.start_position().row as u32 + 1;
737 #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
738 let end_line = node.end_position().row as u32 + 1;
739 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
740 let start_byte = node.start_byte() as u32;
741 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
742 let end_byte = node.end_byte() as u32;
743 // C1 (4.1.1): For Python `decorated_definition` nodes, extract the
744 // decorator name and compute the LSP kind at AST-parse time so the
745 // projection site in `render_json_budgeted` does not need to re-parse.
746 let (decorator, lsp_kind_hint) = if node.kind() == "decorated_definition" {
747 let dec = extract_first_decorator_name(&node, source.as_bytes());
748 let kind_hint = languages::lsp_symbol_kind_for_decorated_definition(
749 dec.as_deref().unwrap_or(""),
750 );
751 (dec, Some(kind_hint))
752 } else {
753 (None, None)
754 };
755 defs.push(Definition {
756 name,
757 kind: node.kind().to_string(),
758 start_line,
759 end_line,
760 scope,
761 signature,
762 start_byte,
763 end_byte,
764 calls: vec![],
765 decorator,
766 lsp_kind_hint,
767 });
768 }
769 }
770
771 defs
772}
773
774// ── Call Extraction & Resolution ────────────────────────────────────
775
776/// Tiebreak priority for def attribution when two defs share the same byte span.
777///
778/// Returns `0` for function-like defs (lowest value = wins in `min_by_key`) and
779/// `1` for structural container defs (class bodies, impl blocks, etc.).
780///
781/// This resolves the Python case where the class body `block` and the first
782/// `function_definition` inside it occupy identical byte ranges; calls inside
783/// the function body should be attributed to the function, not the class block.
784fn is_callable_def_priority(kind: &str) -> u8 {
785 match kind {
786 // Function / method defs: these are the correct attribution targets.
787 "function_item"
788 | "function_definition"
789 | "function_declaration"
790 | "function_signature_item"
791 | "method_definition"
792 | "method_declaration"
793 | "method" => 0,
794 // Structural containers: class body blocks, impl items, etc.
795 // Prefer function-like defs over these when byte ranges tie.
796 _ => 1,
797 }
798}
799
800/// Extract call sites from a source file and assign them to definitions.
801///
802/// Uses the language's call query to find all call expressions, then
803/// assigns each call to the definition whose byte range contains it.
804/// Calls outside any definition body (module-level) are ignored.
805///
806/// For Rust scoped calls (`a::b::foo()`), the `@callee` capture returns the
807/// full `scoped_identifier` node. This function splits it into:
808/// - `name` = bare trailing identifier (`"foo"`)
809/// - `qualified_path` = `Some("a::b::foo")` for disambiguation in `resolve_calls`.
810///
811/// For method calls (`x.method()`), `receiver_type` is inferred from local
812/// context (parameter annotations, let-bindings, impl blocks). See
813/// [`infer_receiver_types`] for the heuristic.
814fn extract_calls(source: &str, call_config: &languages::CallConfig, defs: &mut [Definition]) {
815 let mut parser = Parser::new();
816 if parser.set_language(&call_config.language).is_err() {
817 return;
818 }
819 let Some(tree) = parser.parse(source, None) else {
820 return;
821 };
822
823 // Build receiver-type map: byte_offset_of_call → receiver_type_string.
824 // Done once per file to amortise the tree walk cost.
825 let receiver_map = infer_receiver_types(source, &tree, &call_config.language);
826
827 // HCL: run the HCL-specific call-edge extractor as a post-pass so the
828 // terraform_remote_state references and module blocks contribute edges
829 // that the generic function_call query cannot capture (R2 + R3, Wave 3).
830 if languages::is_hcl_language(&call_config.language) {
831 extract_hcl_call_edges(source, tree.root_node(), defs);
832 }
833
834 // C / C++ (I#55, 4.1.5): emit synthetic call-graph edges from
835 // struct-literal initializer fnptrs. C codebases dispatch via tables of
836 // function pointers (Linux `file_operations`, redis `redisCommandTable`,
837 // libuv handle vtables) — the generic call-expression query cannot
838 // capture these because there is no syntactic `f()` call site; the
839 // function is referenced by bare identifier inside `{ ... }`. Without
840 // this post-pass, every implementation referenced exclusively via such
841 // a table appears dead (Part XI §XI.4: kernel mega-cluster collapse on
842 // Linux, command-implementation collapse on redis).
843 if languages::is_c_language(&call_config.language)
844 || languages::is_cpp_language(&call_config.language)
845 {
846 extract_c_struct_init_edges(source, tree.root_node(), defs);
847 }
848
849 let mut cursor = QueryCursor::new();
850 let mut matches = cursor.matches(&call_config.query, tree.root_node(), source.as_bytes());
851
852 while let Some(m) = matches.next() {
853 let mut full_callee_text = None;
854 let mut call_byte = 0u32;
855
856 for cap in m.captures {
857 let cap_name = &call_config.query.capture_names()[cap.index as usize];
858 if *cap_name == "callee" {
859 full_callee_text =
860 Some(source[cap.node.start_byte()..cap.node.end_byte()].to_string());
861 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
862 {
863 call_byte = cap.node.start_byte() as u32;
864 }
865 }
866 }
867
868 if let Some(full_text) = full_callee_text {
869 // Split qualified path into bare name + optional qualifier.
870 let (name, qualified_path) = if full_text.contains("::") {
871 let bare = full_text
872 .rsplit("::")
873 .next()
874 .unwrap_or(&full_text)
875 .to_string();
876 (bare, Some(full_text))
877 } else {
878 (full_text, None)
879 };
880
881 // Look up receiver type from the pre-built map.
882 let receiver_type = receiver_map.get(&call_byte).cloned();
883
884 // Assign to the most-specific (smallest byte range) enclosing definition.
885 // Using `find` (first match) was incorrect for nested defs: an `impl_item`
886 // wrapping a `function_item` both contain the call site, but the
887 // `function_item` is the correct granularity for method attribution.
888 //
889 // Tiebreak: when two defs have equal byte spans (as happens in Python where
890 // the class body `block` and its first `function_definition` share the same
891 // start/end bytes), prefer function-like defs over structural container defs.
892 // `is_callable_def` returns 0 for function-like kinds (sorts first in min_by_key).
893 let enclosing_idx = defs
894 .iter()
895 .enumerate()
896 .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
897 .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
898 .map(|(i, _)| i);
899
900 if let Some(idx) = enclosing_idx {
901 // Skip self-recursive calls (compare bare name to def name).
902 if defs[idx].name != name {
903 defs[idx].calls.push(CallRef {
904 name,
905 qualified_path,
906 receiver_type,
907 byte_offset: call_byte,
908 resolved: None,
909 });
910 }
911 }
912 // Calls outside any definition are ignored (module-level init).
913 }
914 }
915
916 // JS / TS / TSX (B-0005): attribute closure-argument call edges to the
917 // nearest named enclosing function. Arrow functions and function
918 // expressions passed as arguments (e.g. `useCallback(() => fn(), [deps])`
919 // or `setTimeout(() => fn(), 0)`) are "passthrough" scopes — their inner
920 // calls bubble up to the nearest `function_declaration` or
921 // `method_definition` ancestor. Named const-assigned arrows (`const f =
922 // () => ...`) are NOT passthrough; they are their own def targets.
923 //
924 // This runs as a post-pass so that the main-loop dedup check for existing
925 // (byte_offset, name) pairs can prevent duplicates from being emitted when
926 // a call is already attributed to the correct named function.
927 if is_js_or_ts_language(&call_config.language) {
928 extract_js_closure_call_edges(source, tree.root_node(), defs);
929 }
930}
931
932// ── JS / TS closure call-edge attribution ────────────────────────────────────
933
934/// Returns `true` if `lang` is one of the JavaScript / TypeScript grammars.
935///
936/// Uses the same node-kind-count proxy as [`languages::is_rust_language`].
937/// JS, TS, and TSX all need closure attribution (B-0005).
938fn is_js_or_ts_language(lang: &tree_sitter::Language) -> bool {
939 let js_lang: tree_sitter::Language = tree_sitter_javascript::LANGUAGE.into();
940 let ts_lang: tree_sitter::Language = tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into();
941 let tsx_lang: tree_sitter::Language = tree_sitter_typescript::LANGUAGE_TSX.into();
942 let matches = |reference: tree_sitter::Language| {
943 lang.abi_version() == reference.abi_version()
944 && lang.node_kind_count() == reference.node_kind_count()
945 };
946 matches(js_lang) || matches(ts_lang) || matches(tsx_lang)
947}
948
949/// Returns `true` if `node` is an `arrow_function` or `function_expression`
950/// that is passed directly as an argument to another call expression.
951///
952/// The check is: the node's immediate parent is an `arguments` node.
953/// This distinguishes "argument closures" (passthrough for call attribution)
954/// from "named const arrows" (`const f = () => ...`) whose parent is a
955/// `variable_declarator`.
956fn is_argument_closure(node: tree_sitter::Node<'_>) -> bool {
957 matches!(node.kind(), "arrow_function" | "function_expression")
958 && node.parent().is_some_and(|p| p.kind() == "arguments")
959}
960
961/// Walk up the tree from `node`, passing through all transparent nodes
962/// (`arrow_function`, `function_expression`, `statement_block`, `arguments`,
963/// `call_expression`, `variable_declarator`, `lexical_declaration`,
964/// `expression_statement`, etc.) until we reach a `function_declaration` or
965/// `method_definition`, which is the nearest named enclosing function.
966///
967/// Returns `None` if no named function is found above `node` (e.g. the
968/// closure lives at module/file scope).
969fn nearest_named_fn_ancestor(node: tree_sitter::Node<'_>) -> Option<tree_sitter::Node<'_>> {
970 let mut current = node.parent()?;
971 loop {
972 match current.kind() {
973 // Named function boundaries: stop here.
974 "function_declaration" | "method_definition" => return Some(current),
975 // All other node kinds are transparent: keep climbing.
976 //
977 // The explicit list of transparent kinds (closures, declarations,
978 // control-flow, etc.) is documented here for readability, but the
979 // wildcard covers any new node kinds added in future grammar
980 // versions — the walk is intentionally permissive so that
981 // attribution never silently breaks on grammar updates.
982 //
983 // Transparent: "arrow_function", "function_expression",
984 // "statement_block", "arguments", "call_expression",
985 // "member_expression", "variable_declarator", "lexical_declaration",
986 // "expression_statement", "return_statement", "await_expression",
987 // "class_body", "class_declaration", "export_statement",
988 // "if_statement", "while_statement", "for_statement", etc.
989 _ => {}
990 }
991 match current.parent() {
992 Some(p) => current = p,
993 None => return None,
994 }
995 }
996}
997
998/// Collect all `call_expression` callee names that appear directly inside
999/// `closure` (an `arrow_function` or `function_expression`), visiting only
1000/// one level of closure depth (recursive closures are handled by the outer
1001/// DFS in [`extract_js_closure_call_edges`]).
1002///
1003/// Returns a `Vec` of `(callee_name, byte_offset)` pairs matching the callee
1004/// capture of the JS call query: either a bare `identifier` or the
1005/// `property_identifier` from a `member_expression`.
1006fn collect_calls_in_closure<'a>(
1007 source: &'a str,
1008 closure: tree_sitter::Node<'a>,
1009) -> Vec<(String, u32)> {
1010 let mut results = Vec::new();
1011 let mut stack: Vec<tree_sitter::Node<'_>> = Vec::new();
1012 // Start from the closure body, not the closure node itself.
1013 let body = closure.child_by_field_name("body").unwrap_or(closure);
1014 let mut cursor = body.walk();
1015 for child in body.children(&mut cursor) {
1016 stack.push(child);
1017 }
1018
1019 while let Some(node) = stack.pop() {
1020 if node.kind() == "call_expression" {
1021 // Extract callee: either `function: identifier` or
1022 // `function: member_expression property: property_identifier`.
1023 if let Some(fn_node) = node.child_by_field_name("function") {
1024 let callee_opt: Option<(String, u32)> = match fn_node.kind() {
1025 "identifier" => {
1026 let name = source[fn_node.start_byte()..fn_node.end_byte()].to_string();
1027 #[expect(
1028 clippy::cast_possible_truncation,
1029 reason = "byte offsets fit in u32"
1030 )]
1031 let byte = fn_node.start_byte() as u32;
1032 Some((name, byte))
1033 }
1034 "member_expression" => fn_node.child_by_field_name("property").map(|prop| {
1035 let name = source[prop.start_byte()..prop.end_byte()].to_string();
1036 #[expect(
1037 clippy::cast_possible_truncation,
1038 reason = "byte offsets fit in u32"
1039 )]
1040 let byte = prop.start_byte() as u32;
1041 (name, byte)
1042 }),
1043 _ => None,
1044 };
1045 if let Some(pair) = callee_opt {
1046 results.push(pair);
1047 }
1048 }
1049 }
1050 // Recurse into children, but do NOT descend into nested
1051 // arrow_function / function_expression nodes — those are either
1052 // handled by the outer DFS (if they are argument closures) or are
1053 // named const-arrows (which are their own defs and handled by the
1054 // main loop).
1055 if !matches!(node.kind(), "arrow_function" | "function_expression") {
1056 let mut c = node.walk();
1057 for child in node.children(&mut c) {
1058 stack.push(child);
1059 }
1060 }
1061 }
1062 results
1063}
1064
1065/// Post-pass for JS / TS / TSX files.
1066///
1067/// Two sub-passes:
1068///
1069/// **Pass 1 — closure-argument attribution**: For every `arrow_function` or
1070/// `function_expression` that is passed as an argument to a call (i.e., its
1071/// parent is an `arguments` node), attributes the inner calls to the nearest
1072/// named enclosing `function_declaration` or `method_definition`.
1073///
1074/// **Pass 2 — variable-declarator propagation**: For every `variable_declarator`
1075/// def that sits inside a named function, propagates all its calls to the
1076/// enclosing named function. This captures the outer call itself (e.g.
1077/// `useCallback(...)` in `const handler = useCallback(...)`) which the main
1078/// loop attributed to `handler` (the smallest enclosing def) but which also
1079/// belongs to the enclosing React component.
1080///
1081/// Both passes use `(byte_offset, name)` deduplication so calls are never
1082/// duplicated on a single def.
1083///
1084/// This fixes B-0005: React hooks (`useCallback`, `useMemo`, `useEffect`),
1085/// timer APIs (`setTimeout`, `setInterval`), array methods
1086/// (`.map`, `.filter`, `.forEach`, `.reduce`), and Express middleware
1087/// (`app.use`, `app.get`) all pass closures as arguments. Without this
1088/// post-pass, every function called INSIDE such a closure appears dead
1089/// because the call edge is attributed to the anonymous closure (which has
1090/// no def of its own) rather than to the enclosing named function.
1091///
1092/// **Rule** (from DESIGN §B-0005): anonymous closures and arrows are
1093/// "passthrough". Walk up to the nearest `function_declaration` /
1094/// `method_definition`. If none is found (closure at module scope), no edge
1095/// is emitted — matching the existing behaviour for top-level JS calls.
1096fn extract_js_closure_call_edges(
1097 source: &str,
1098 root: tree_sitter::Node<'_>,
1099 defs: &mut [Definition],
1100) {
1101 // Pass 1: walk tree — attribute calls inside argument-closure bodies to
1102 // the nearest named enclosing function.
1103 {
1104 let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1105 while let Some(node) = stack.pop() {
1106 if is_argument_closure(node) {
1107 // Find the nearest named function ancestor.
1108 if let Some(named_fn) = nearest_named_fn_ancestor(node) {
1109 // Identify which def this named function corresponds to by
1110 // matching byte range and kind.
1111 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1112 let fn_start = named_fn.start_byte() as u32;
1113 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1114 let fn_end = named_fn.end_byte() as u32;
1115
1116 let def_idx = defs.iter().position(|d| {
1117 d.start_byte == fn_start
1118 && d.end_byte == fn_end
1119 && matches!(
1120 d.kind.as_str(),
1121 "function_declaration" | "method_definition"
1122 )
1123 });
1124
1125 if let Some(idx) = def_idx {
1126 // Collect calls inside this closure (one level deep;
1127 // nested closures are handled by their own DFS iteration).
1128 let inner_calls = collect_calls_in_closure(source, node);
1129 for (name, byte_offset) in inner_calls {
1130 // Skip self-recursive calls.
1131 if defs[idx].name == name {
1132 continue;
1133 }
1134 // Skip duplicate edges already present in the def.
1135 if defs[idx]
1136 .calls
1137 .iter()
1138 .any(|c| c.byte_offset == byte_offset && c.name == name)
1139 {
1140 continue;
1141 }
1142 defs[idx].calls.push(CallRef {
1143 name,
1144 qualified_path: None,
1145 receiver_type: None,
1146 byte_offset,
1147 resolved: None,
1148 });
1149 }
1150 }
1151 }
1152 }
1153 // Recurse into all children.
1154 let mut cursor = node.walk();
1155 for child in node.children(&mut cursor) {
1156 stack.push(child);
1157 }
1158 }
1159 }
1160
1161 // Pass 2: propagate calls from variable_declarator defs to their enclosing
1162 // named function defs.
1163 //
1164 // When a JS pattern like `const handler = useCallback(...)` creates a
1165 // `variable_declarator` def named `handler`, the main loop attributes
1166 // calls like `useCallback(...)` to `handler` (the smallest enclosing def).
1167 // Those calls also semantically belong to the enclosing named function
1168 // (e.g. `Component`) because they execute in its runtime context.
1169 //
1170 // We collect (variable_declarator_idx, enclosing_named_fn_idx) pairs first
1171 // to avoid mutating `defs` while iterating over it.
1172 let propagation_pairs: Vec<(usize, usize)> = defs
1173 .iter()
1174 .enumerate()
1175 .filter(|(_, d)| d.kind == "variable_declarator")
1176 .filter_map(|(var_idx, var_def)| {
1177 // Find the smallest named function def that strictly contains this
1178 // variable_declarator (not equal byte range — strict containment).
1179 let enclosing = defs
1180 .iter()
1181 .enumerate()
1182 .filter(|(enc_idx, enc)| {
1183 *enc_idx != var_idx
1184 && matches!(
1185 enc.kind.as_str(),
1186 "function_declaration" | "method_definition"
1187 )
1188 && enc.start_byte <= var_def.start_byte
1189 && var_def.end_byte <= enc.end_byte
1190 })
1191 .min_by_key(|(_, enc)| enc.end_byte - enc.start_byte)
1192 .map(|(i, _)| i);
1193 enclosing.map(|enc_idx| (var_idx, enc_idx))
1194 })
1195 .collect();
1196
1197 for (var_idx, enc_idx) in propagation_pairs {
1198 // Clone the calls to propagate (avoiding borrow-checker conflict).
1199 let calls_to_propagate: Vec<(String, u32)> = defs[var_idx]
1200 .calls
1201 .iter()
1202 .map(|c| (c.name.clone(), c.byte_offset))
1203 .collect();
1204 for (name, byte_offset) in calls_to_propagate {
1205 // Skip self-recursive edges.
1206 if defs[enc_idx].name == name {
1207 continue;
1208 }
1209 // Skip duplicates.
1210 if defs[enc_idx]
1211 .calls
1212 .iter()
1213 .any(|c| c.byte_offset == byte_offset && c.name == name)
1214 {
1215 continue;
1216 }
1217 defs[enc_idx].calls.push(CallRef {
1218 name,
1219 qualified_path: None,
1220 receiver_type: None,
1221 byte_offset,
1222 resolved: None,
1223 });
1224 }
1225 }
1226}
1227
1228// HCL: post-pass call-edge extraction for terraform_remote_state and module
1229// blocks. These are not function calls — they are HCL-specific structural
1230// references to other Terraform modules — so the generic
1231// `(function_call (identifier) @callee) @call` pattern in
1232// `call_query_for_extension("tf")` cannot capture them. This helper runs
1233// once per HCL file inside `extract_calls` (R2 + R3, Wave 3).
1234
1235/// Walk an HCL parse tree and emit CallRef entries for:
1236///
1237/// 1. `data.terraform_remote_state.<NAME>.outputs.<ATTR>` expressions:
1238/// one CallRef per reference, with `name = NAME` and
1239/// `qualified_path = Some("terraform_remote_state.NAME")`. These
1240/// connect the current file to the named remote-state module's outputs.
1241///
1242/// 2. `module "X" { source = "../X" }` blocks: one CallRef with
1243/// `name = X` (the label) and `qualified_path = Some("module.X")`.
1244/// The module reference connects to the module's directory in
1245/// `resolve_import` (HCL module-source resolution is not implemented
1246/// yet — the qualified_path carrier is the contract; resolve adds the
1247/// file lookup).
1248///
1249/// Each emitted CallRef is attached to the smallest enclosing definition
1250/// by byte range — matching the same heuristic used by `extract_calls`.
1251fn extract_hcl_call_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1252 // Walk all named descendants iteratively.
1253 let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1254 while let Some(node) = stack.pop() {
1255 // Defer to a function-style helper per node kind.
1256 match node.kind() {
1257 "expression" => hcl_visit_expression(source, node, defs),
1258 "block" => hcl_visit_block(source, node, defs),
1259 _ => {}
1260 }
1261 // Recurse into named children.
1262 let mut cursor = node.walk();
1263 for child in node.children(&mut cursor) {
1264 if child.is_named() {
1265 stack.push(child);
1266 }
1267 }
1268 }
1269}
1270
1271/// Inspect an HCL `expression` node for the
1272/// `data.terraform_remote_state.<NAME>.outputs.<ATTR>` reference pattern.
1273///
1274/// The expression tree looks like:
1275/// ```text
1276/// expression
1277/// variable_expr
1278/// identifier "data"
1279/// get_attr
1280/// identifier "terraform_remote_state"
1281/// get_attr
1282/// identifier "<NAME>"
1283/// get_attr
1284/// identifier "outputs"
1285/// get_attr
1286/// identifier "<ATTR>"
1287/// ```
1288fn hcl_visit_expression(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1289 // Collect children: must be `variable_expr` (with identifier="data")
1290 // followed by a chain of `get_attr` nodes (each with an `identifier` child).
1291 let mut cursor = node.walk();
1292 let mut child_iter = node.children(&mut cursor);
1293 let Some(first) = child_iter.next() else {
1294 return;
1295 };
1296 if first.kind() != "variable_expr" {
1297 return;
1298 }
1299 let Some(first_id) = first.child_by_field_name("name").or_else(|| {
1300 // Fallback: find first named child that's an identifier.
1301 let mut c = first.walk();
1302 first.children(&mut c).find(|n| n.kind() == "identifier")
1303 }) else {
1304 return;
1305 };
1306 if &source[first_id.start_byte()..first_id.end_byte()] != "data" {
1307 return;
1308 }
1309
1310 // Collect identifiers from the chain of get_attr.
1311 let mut chain: Vec<String> = Vec::new();
1312 for child in child_iter {
1313 if child.kind() != "get_attr" {
1314 return; // not a pure attribute chain
1315 }
1316 let mut gc = child.walk();
1317 let id = child.children(&mut gc).find(|n| n.kind() == "identifier");
1318 let Some(id_node) = id else { return };
1319 chain.push(source[id_node.start_byte()..id_node.end_byte()].to_string());
1320 }
1321
1322 // Expect: terraform_remote_state, <NAME>, outputs, <ATTR>
1323 if chain.len() < 2 || chain[0] != "terraform_remote_state" {
1324 return;
1325 }
1326 let name = chain[1].clone();
1327 let qualified_path = format!("terraform_remote_state.{name}");
1328
1329 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1330 let call_byte = node.start_byte() as u32;
1331 attach_hcl_call(defs, call_byte, name, Some(qualified_path));
1332}
1333
1334/// Inspect an HCL `block` node for the `module "X" { source = "../X" }`
1335/// pattern. Emits one CallRef per matching block.
1336fn hcl_visit_block(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1337 // First child must be identifier="module".
1338 let mut cursor = node.walk();
1339 let children: Vec<tree_sitter::Node<'_>> = node.children(&mut cursor).collect();
1340 let Some(first) = children.first() else {
1341 return;
1342 };
1343 if first.kind() != "identifier" || &source[first.start_byte()..first.end_byte()] != "module" {
1344 return;
1345 }
1346 // Next child should be a string_lit (the module label).
1347 let label_node = children.iter().find(|c| c.kind() == "string_lit");
1348 let Some(label_node) = label_node else {
1349 return;
1350 };
1351 let mut lc = label_node.walk();
1352 let template = label_node
1353 .children(&mut lc)
1354 .find(|n| n.kind() == "template_literal");
1355 let Some(template) = template else {
1356 return;
1357 };
1358 let label = source[template.start_byte()..template.end_byte()].to_string();
1359 let qualified_path = format!("module.{label}");
1360
1361 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1362 let call_byte = node.start_byte() as u32;
1363 attach_hcl_call(defs, call_byte, label, Some(qualified_path));
1364}
1365
1366/// Attach a synthesized HCL CallRef to the smallest enclosing definition.
1367/// Mirrors the byte-range attribution from `extract_calls`.
1368fn attach_hcl_call(
1369 defs: &mut [Definition],
1370 call_byte: u32,
1371 name: String,
1372 qualified_path: Option<String>,
1373) {
1374 let enclosing_idx = defs
1375 .iter()
1376 .enumerate()
1377 .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
1378 .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
1379 .map(|(i, _)| i);
1380 if let Some(idx) = enclosing_idx {
1381 // Skip self-recursive emission (would happen if the enclosing def
1382 // happens to share the same `name` as the synthesized callee).
1383 if defs[idx].name != name {
1384 defs[idx].calls.push(CallRef {
1385 name,
1386 qualified_path,
1387 receiver_type: None,
1388 byte_offset: call_byte,
1389 resolved: None,
1390 });
1391 }
1392 }
1393}
1394
1395/// Walk a C / C++ parse tree and emit synthetic CallRef entries for every
1396/// function identifier appearing inside a struct or array initializer
1397/// literal (I#55, 4.1.5).
1398///
1399/// Two syntactic forms produce edges:
1400///
1401/// 1. **Designated initializer** — `(initializer_pair value: (identifier))`
1402/// — corresponds to `.field = funcname` syntax (Linux `file_operations`,
1403/// ALSA `snd_pcm_ops`, etc):
1404/// ```c
1405/// static const struct file_operations my_fops = {
1406/// .read = my_read, // ← edge: my_fops → my_read
1407/// .write = my_write, // ← edge: my_fops → my_write
1408/// };
1409/// ```
1410///
1411/// 2. **Positional initializer** — `(initializer_list (identifier))` —
1412/// corresponds to bare identifier slots inside `{ ... }` (redis
1413/// `redisCommandTable`, libuv handle vtables):
1414/// ```c
1415/// struct redisCommand cmds[] = {
1416/// {"get", getCommand, 2}, // ← edge: cmds → getCommand
1417/// {"set", setCommand, -3}, // ← edge: cmds → setCommand
1418/// };
1419/// ```
1420///
1421/// Each emitted CallRef is attached to the smallest enclosing definition by
1422/// byte range (same heuristic as `extract_calls` / `attach_hcl_call`). For
1423/// the typical use case the enclosing def is the array/struct declaration
1424/// itself (e.g. `my_fops` or `cmds`). When the enclosing def is itself the
1425/// referenced function (e.g. a struct field designator inside a function
1426/// body) the self-recursive edge is suppressed.
1427///
1428/// Non-identifier initializer values (string literals, integers, nested
1429/// braces) are skipped by tree-sitter's `kind() == "identifier"` filter, so
1430/// no false-positive edges to undefined symbols are emitted. The resolver
1431/// (`resolve_calls`) then either binds the identifier to a real function
1432/// def (preserved as a real edge) or leaves `resolved = None` (dropped at
1433/// edge-construction time, matching how all unresolved CallRefs behave).
1434fn extract_c_struct_init_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1435 // Iterative DFS to avoid stack-blowup on deeply-nested initializer
1436 // tables (some kernel drivers nest 4–5 levels of struct-of-struct).
1437 let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1438 while let Some(node) = stack.pop() {
1439 match node.kind() {
1440 // Designated initializer: `.field = funcname`. The grammar
1441 // exposes the rhs as a positional child, but using the `value:`
1442 // field selector picks it unambiguously when present and falls
1443 // back to the last named child otherwise. We only consume bare
1444 // identifier values — string_literal, number_literal, and
1445 // nested initializer_list are dispatched separately (the
1446 // recursion below re-enters nested initializer_list nodes).
1447 "initializer_pair" => {
1448 let value_node = node.child_by_field_name("value").or_else(|| {
1449 // Fallback: last named child that isn't a designator.
1450 let mut c = node.walk();
1451 node.children(&mut c)
1452 .filter(|n| n.is_named() && n.kind() != "field_designator")
1453 .last()
1454 });
1455 if let Some(v) = value_node
1456 && v.kind() == "identifier"
1457 {
1458 emit_c_init_edge(source, v, defs);
1459 }
1460 }
1461 // Positional initializer: bare identifier directly inside an
1462 // `initializer_list`. We do NOT recurse from here to pick up
1463 // identifiers — the outer DFS already visits every node, and
1464 // emitting on direct identifier children of `initializer_list`
1465 // covers `{"name", funcname, 2}` patterns. Nested
1466 // `initializer_list` children (e.g. `{{...},{...}}`) are
1467 // popped onto the stack and processed in their own iteration.
1468 "initializer_list" => {
1469 let mut c = node.walk();
1470 for child in node.children(&mut c) {
1471 if child.kind() == "identifier" {
1472 emit_c_init_edge(source, child, defs);
1473 }
1474 }
1475 }
1476 _ => {}
1477 }
1478 // Recurse into named children. The DFS visits the entire subtree
1479 // so nested `initializer_pair` / `initializer_list` nodes are
1480 // reached without special handling.
1481 let mut cursor = node.walk();
1482 for child in node.children(&mut cursor) {
1483 if child.is_named() {
1484 stack.push(child);
1485 }
1486 }
1487 }
1488}
1489
1490/// Emit one synthetic CallRef edge for a C/C++ struct-literal fnptr
1491/// reference. `ident_node` must be an `identifier` node; its text becomes
1492/// the callee name. The edge is attached to the smallest enclosing
1493/// definition (typically the surrounding `declaration` def for the table
1494/// variable itself).
1495fn emit_c_init_edge(source: &str, ident_node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1496 let name = source[ident_node.start_byte()..ident_node.end_byte()].to_string();
1497 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1498 let call_byte = ident_node.start_byte() as u32;
1499
1500 let enclosing_idx = defs
1501 .iter()
1502 .enumerate()
1503 .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
1504 .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
1505 .map(|(i, _)| i);
1506
1507 let Some(idx) = enclosing_idx else {
1508 return;
1509 };
1510 // Skip self-recursive emission (would happen if the enclosing def is
1511 // itself the referenced function — e.g. a static initializer inside a
1512 // function body whose .field designates the same function).
1513 if defs[idx].name == name {
1514 return;
1515 }
1516 // Skip duplicate edges (same caller, same name, same byte offset) —
1517 // belt-and-braces against the DFS visiting an identifier through both
1518 // its parent initializer_pair and a containing initializer_list scan.
1519 if defs[idx]
1520 .calls
1521 .iter()
1522 .any(|c| c.byte_offset == call_byte && c.name == name)
1523 {
1524 return;
1525 }
1526 defs[idx].calls.push(CallRef {
1527 name,
1528 qualified_path: None,
1529 receiver_type: None,
1530 byte_offset: call_byte,
1531 resolved: None,
1532 });
1533}
1534
1535/// Infer method-call receiver types from local context within a parse tree.
1536///
1537/// Returns a map from `byte_offset_of_@callee_capture` to a receiver type string.
1538///
1539/// Dispatches to a language-specific collector:
1540///
1541/// - **Rust**: [`collect_rust_receiver_types`] — three heuristic cases:
1542/// 1. `self.method()` inside `impl Foo { … }` → `"Foo"`.
1543/// 2. `x.method()` where `x: Bar` is a function parameter → `"Bar"`.
1544/// 3. `x.method()` after `let x = Foo::new()` → `"Foo"`.
1545///
1546/// - **Python**: [`collect_python_receiver_types`] — two heuristic cases:
1547/// 1. `self.method()` inside a class method → class name from enclosing
1548/// `class_definition`.
1549/// 2. `instance.method()` where `instance: ClassName` type annotation or
1550/// `instance = ClassName(...)` assignment is visible in the same scope.
1551///
1552/// - **Go**: [`collect_go_receiver_types`] — one heuristic case:
1553/// 1. `recv.Method()` inside a `method_declaration` where `recv` is the
1554/// named receiver parameter → receiver type from the method signature.
1555///
1556/// This is heuristic, not type-inference-complete. Unknown/ambiguous cases
1557/// produce no entry in the map; `extract_calls` leaves those `receiver_type = None`.
1558fn infer_receiver_types(
1559 source: &str,
1560 tree: &tree_sitter::Tree,
1561 language: &tree_sitter::Language,
1562) -> HashMap<u32, String> {
1563 let mut map: HashMap<u32, String> = HashMap::new();
1564
1565 if languages::is_rust_language(language) {
1566 collect_rust_receiver_types(source, tree.root_node(), &mut map);
1567 } else if languages::is_python_language(language) {
1568 collect_python_receiver_types(source, tree.root_node(), &mut map);
1569 } else if languages::is_go_language(language) {
1570 collect_go_receiver_types(source, tree.root_node(), &mut map);
1571 }
1572 // Other languages: no receiver inference — leave map empty.
1573
1574 map
1575}
1576
1577/// Walk the Rust parse tree and fill `map` with receiver-type inference.
1578///
1579/// This is a recursive descent that tracks:
1580/// - The current `impl Foo` or `impl Foo for Bar` type name (for `self.*` calls).
1581/// - Parameter type annotations (for `x: SomeType` → `x` has type `SomeType`).
1582/// - Constructor let-bindings (`let x = Foo::new()` → `x` has type `Foo`).
1583fn collect_rust_receiver_types(
1584 source: &str,
1585 node: tree_sitter::Node<'_>,
1586 map: &mut HashMap<u32, String>,
1587) {
1588 // We use a stack-based walk to avoid deep recursion on large files.
1589 // Each stack entry carries (node, impl_type_context).
1590 let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(node, None)];
1591
1592 while let Some((n, impl_ctx)) = stack.pop() {
1593 match n.kind() {
1594 "impl_item" => {
1595 // Extract `impl Foo` or `impl Trait for Foo` → capture the `for` type.
1596 // tree-sitter-rust shape: `(impl_item type: (type_identifier) @type)` for
1597 // inherent impls, and `(impl_item trait: … type: (type_identifier) @type)`
1598 // for trait impls. Both have a child named `type`.
1599 let impl_type = extract_impl_self_type(source, n);
1600 let new_ctx = impl_type.or_else(|| impl_ctx.clone());
1601 let mut cursor = n.walk();
1602 for child in n.children(&mut cursor) {
1603 stack.push((child, new_ctx.clone()));
1604 }
1605 }
1606 "function_item" => {
1607 // Build parameter bindings: (param_name → type_name).
1608 let param_types = extract_param_types(source, n);
1609 // Build let-binding type map from constructor calls.
1610 let let_types = extract_let_binding_types(source, n);
1611 // Annotate call sites within this function body.
1612 annotate_method_calls(
1613 source,
1614 n,
1615 impl_ctx.as_deref(),
1616 ¶m_types,
1617 &let_types,
1618 map,
1619 );
1620 // Do NOT recurse into function_item children with the outer stack —
1621 // function bodies are fully handled by annotate_method_calls.
1622 // (Nested fn items would re-enter via their own impl_item context.)
1623 // Push children with same impl_ctx so nested impl blocks are found.
1624 let mut cursor = n.walk();
1625 for child in n.children(&mut cursor) {
1626 stack.push((child, impl_ctx.clone()));
1627 }
1628 }
1629 _ => {
1630 let mut cursor = n.walk();
1631 for child in n.children(&mut cursor) {
1632 stack.push((child, impl_ctx.clone()));
1633 }
1634 }
1635 }
1636 }
1637}
1638
1639/// Extract the self type from an `impl_item` node.
1640///
1641/// For `impl Foo { … }` → `Some("Foo")`.
1642/// For `impl Trait for Foo { … }` → `Some("Foo")` (the concrete `for` type).
1643fn extract_impl_self_type(source: &str, impl_node: tree_sitter::Node<'_>) -> Option<String> {
1644 // tree-sitter-rust: impl_item has a field named "type" for the self type.
1645 // For `impl Foo for Bar { }`, "type" is Bar; for `impl Foo { }`, "type" is Foo.
1646 let type_node = impl_node.child_by_field_name("type")?;
1647 Some(source[type_node.start_byte()..type_node.end_byte()].to_string())
1648}
1649
1650/// Extract parameter name → type mappings from a function signature.
1651///
1652/// Handles `fn foo(x: Bar, y: Baz)` → `{"x": "Bar", "y": "Baz"}`.
1653/// The `self`/`&self`/`&mut self` parameter is skipped (handled via impl_ctx).
1654fn extract_param_types(source: &str, fn_node: tree_sitter::Node<'_>) -> HashMap<String, String> {
1655 let mut params: HashMap<String, String> = HashMap::new();
1656 let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1657 return params;
1658 };
1659 let mut cursor = params_node.walk();
1660 for param in params_node.children(&mut cursor) {
1661 if param.kind() == "parameter" {
1662 // parameter has children: pattern (identifier) and type
1663 let mut param_name = None;
1664 let mut param_type = None;
1665 let mut pc = param.walk();
1666 for child in param.children(&mut pc) {
1667 match child.kind() {
1668 "identifier" | "mutable_specifier" if param_name.is_none() => {
1669 let text = source[child.start_byte()..child.end_byte()].to_string();
1670 if text != "mut" {
1671 param_name = Some(text);
1672 }
1673 }
1674 "type_identifier"
1675 | "generic_type"
1676 | "reference_type"
1677 | "scoped_type_identifier"
1678 if param_type.is_none() =>
1679 {
1680 // Extract the base type identifier from potentially complex types.
1681 param_type = Some(extract_base_type(source, child));
1682 }
1683 _ => {}
1684 }
1685 }
1686 if let (Some(name), Some(ty)) = (param_name, param_type)
1687 && !ty.is_empty()
1688 {
1689 params.insert(name, ty);
1690 }
1691 }
1692 // Also handle typed_pattern in newer grammars
1693 if param.kind() == "typed_pattern" {
1694 let mut name_part = None;
1695 let mut type_part = None;
1696 let mut pc = param.walk();
1697 for child in param.children(&mut pc) {
1698 if child.kind() == "identifier" && name_part.is_none() {
1699 name_part = Some(source[child.start_byte()..child.end_byte()].to_string());
1700 } else if matches!(
1701 child.kind(),
1702 "type_identifier"
1703 | "generic_type"
1704 | "reference_type"
1705 | "scoped_type_identifier"
1706 ) && type_part.is_none()
1707 {
1708 type_part = Some(extract_base_type(source, child));
1709 }
1710 }
1711 if let (Some(name), Some(ty)) = (name_part, type_part)
1712 && !ty.is_empty()
1713 {
1714 params.insert(name, ty);
1715 }
1716 }
1717 }
1718 params
1719}
1720
1721/// Extract the base `TypeIdentifier` from a potentially complex type node.
1722///
1723/// For `Bar`, `&Bar`, `&mut Bar`, `Bar<T>` → returns `"Bar"`.
1724/// For `module::Bar` → returns `"Bar"` (bare name for matching).
1725fn extract_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1726 match node.kind() {
1727 "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1728 "generic_type" | "reference_type" | "mutable_specifier" | "scoped_type_identifier" => {
1729 // Recurse to find the innermost type_identifier
1730 let mut cursor = node.walk();
1731 for child in node.children(&mut cursor) {
1732 let t = extract_base_type(source, child);
1733 if !t.is_empty() {
1734 return t;
1735 }
1736 }
1737 String::new()
1738 }
1739 _ => {
1740 // For other nodes, try children
1741 let mut cursor = node.walk();
1742 for child in node.children(&mut cursor) {
1743 if child.kind() == "type_identifier" {
1744 return source[child.start_byte()..child.end_byte()].to_string();
1745 }
1746 }
1747 String::new()
1748 }
1749 }
1750}
1751
1752/// Scan a function body for `let x = Foo::new()` patterns.
1753///
1754/// Returns a map from local variable name to the constructor type name.
1755/// E.g., `let x = Foo::new();` → `{"x": "Foo"}`.
1756fn extract_let_binding_types(
1757 source: &str,
1758 fn_node: tree_sitter::Node<'_>,
1759) -> HashMap<String, String> {
1760 let mut bindings: HashMap<String, String> = HashMap::new();
1761
1762 let Some(body) = fn_node.child_by_field_name("body") else {
1763 return bindings;
1764 };
1765
1766 // Walk the function body looking for let_declaration nodes.
1767 let mut stack = vec![body];
1768 while let Some(n) = stack.pop() {
1769 if n.kind() == "let_declaration" {
1770 // let_declaration: pattern (identifier) + value (call_expression or …)
1771 let mut binding_name = None;
1772 let mut constructor_type = None;
1773 let mut cursor = n.walk();
1774 for child in n.children(&mut cursor) {
1775 match child.kind() {
1776 "identifier" if binding_name.is_none() => {
1777 binding_name =
1778 Some(source[child.start_byte()..child.end_byte()].to_string());
1779 }
1780 "call_expression" => {
1781 // Look for `Foo::new()` or `Foo::from(…)` patterns.
1782 // The function child of call_expression is a scoped_identifier.
1783 if let Some(func) = child.child_by_field_name("function")
1784 && func.kind() == "scoped_identifier"
1785 {
1786 // scoped_identifier path: `Foo::new` — extract head segment.
1787 let full = source[func.start_byte()..func.end_byte()].to_string();
1788 let head = full.split("::").next().unwrap_or("").to_string();
1789 if !head.is_empty()
1790 && head.chars().next().is_some_and(char::is_uppercase)
1791 {
1792 constructor_type = Some(head);
1793 }
1794 }
1795 }
1796 _ => {}
1797 }
1798 }
1799 if let (Some(name), Some(ty)) = (binding_name, constructor_type) {
1800 bindings.insert(name, ty);
1801 }
1802 }
1803 // Push children for recursive walk.
1804 let mut cursor = n.walk();
1805 for child in n.children(&mut cursor) {
1806 stack.push(child);
1807 }
1808 }
1809
1810 bindings
1811}
1812
1813/// Walk a function body and annotate method-call byte offsets with receiver types.
1814///
1815/// A "method call" in the ripvec call query is:
1816/// `(call_expression function: (field_expression field: (field_identifier) @callee))`
1817///
1818/// The receiver is the `value` child of `field_expression`. This function
1819/// checks whether the receiver is:
1820/// - `self` → use `impl_ctx` type.
1821/// - An identifier matching a parameter type in `param_types`.
1822/// - An identifier matching a constructor let-binding in `let_types`.
1823fn annotate_method_calls(
1824 source: &str,
1825 fn_node: tree_sitter::Node<'_>,
1826 impl_ctx: Option<&str>,
1827 param_types: &HashMap<String, String>,
1828 let_types: &HashMap<String, String>,
1829 map: &mut HashMap<u32, String>,
1830) {
1831 // Walk the entire function (including its body) looking for call_expression nodes.
1832 let mut stack = vec![fn_node];
1833 while let Some(n) = stack.pop() {
1834 if n.kind() == "call_expression"
1835 && let Some(func) = n.child_by_field_name("function")
1836 && func.kind() == "field_expression"
1837 {
1838 // field_expression: value (receiver) + field (method name identifier)
1839 if let (Some(recv), Some(field)) = (
1840 func.child_by_field_name("value"),
1841 func.child_by_field_name("field"),
1842 ) {
1843 let recv_text = source[recv.start_byte()..recv.end_byte()].to_string();
1844 let receiver_type = if recv_text == "self" || recv_text == "*self" {
1845 impl_ctx.map(str::to_owned)
1846 } else {
1847 // Strip ref sigils for lookup.
1848 let base = recv_text
1849 .trim_start_matches('*')
1850 .trim_start_matches('&')
1851 .trim();
1852 param_types
1853 .get(base)
1854 .or_else(|| let_types.get(base))
1855 .cloned()
1856 };
1857
1858 if let Some(ty) = receiver_type {
1859 // The `@callee` capture byte offset is the start of the field node.
1860 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1861 let field_byte = field.start_byte() as u32;
1862 map.insert(field_byte, ty);
1863 }
1864 }
1865 }
1866 let mut cursor = n.walk();
1867 for child in n.children(&mut cursor) {
1868 stack.push(child);
1869 }
1870 }
1871}
1872
1873// ── Python receiver-type heuristic ───────────────────────────────────
1874
1875/// Walk the Python parse tree and fill `map` with receiver-type inference.
1876///
1877/// Two heuristic cases:
1878///
1879/// 1. **`self.method()` inside a method** — when the `attribute` call receiver is
1880/// the literal text `self`, the receiver type is the name of the nearest
1881/// enclosing `class_definition`.
1882///
1883/// 2. **`instance.method()` with a type annotation or constructor call** —
1884/// when a function parameter has a PEP 484 annotation `param: ClassName` or
1885/// when a local assignment `param = ClassName(...)` precedes the call, the
1886/// receiver type is bound to `ClassName`.
1887///
1888/// The Python call query captures:
1889/// - `(call function: (attribute attribute: (identifier) @callee)) @call`
1890///
1891/// Within the `attribute` node, `value` is the receiver expression and
1892/// `attribute` is the method name (the `@callee` capture). The `@callee`
1893/// byte offset is the start of the `attribute` child identifier node.
1894fn collect_python_receiver_types(
1895 source: &str,
1896 root: tree_sitter::Node<'_>,
1897 map: &mut HashMap<u32, String>,
1898) {
1899 // Stack carries (node, class_ctx: Option<String>).
1900 // class_ctx is the name of the nearest enclosing class_definition.
1901 let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(root, None)];
1902
1903 while let Some((n, class_ctx)) = stack.pop() {
1904 match n.kind() {
1905 "class_definition" => {
1906 // Extract the class name from the `name` child.
1907 let class_name = n
1908 .child_by_field_name("name")
1909 .map(|c| source[c.start_byte()..c.end_byte()].to_string());
1910 let new_ctx = class_name.or_else(|| class_ctx.clone());
1911 let mut cursor = n.walk();
1912 for child in n.children(&mut cursor) {
1913 stack.push((child, new_ctx.clone()));
1914 }
1915 }
1916 "function_definition" => {
1917 // Build parameter annotation map: param_name → type_name.
1918 let param_types = extract_python_param_types(source, n);
1919 // Build local assignment map: var_name → constructor type.
1920 let let_types = extract_python_assignment_types(source, n);
1921 // Annotate attribute call sites within this function body.
1922 annotate_python_method_calls(
1923 source,
1924 n,
1925 class_ctx.as_deref(),
1926 ¶m_types,
1927 &let_types,
1928 map,
1929 );
1930 // Push children with same class_ctx so nested classes are found.
1931 let mut cursor = n.walk();
1932 for child in n.children(&mut cursor) {
1933 stack.push((child, class_ctx.clone()));
1934 }
1935 }
1936 _ => {
1937 let mut cursor = n.walk();
1938 for child in n.children(&mut cursor) {
1939 stack.push((child, class_ctx.clone()));
1940 }
1941 }
1942 }
1943 }
1944}
1945
1946/// Extract Python parameter name → type annotation mappings.
1947///
1948/// Handles PEP 484 style: `def foo(self, x: Bar, y: Baz) -> ...`.
1949/// The `self` parameter is excluded (handled via class_ctx).
1950/// Returns `{"x": "Bar", "y": "Baz"}`.
1951fn extract_python_param_types(
1952 source: &str,
1953 fn_node: tree_sitter::Node<'_>,
1954) -> HashMap<String, String> {
1955 let mut params: HashMap<String, String> = HashMap::new();
1956 let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1957 return params;
1958 };
1959
1960 // Parameters node children include `identifier`, `typed_parameter`,
1961 // `typed_default_parameter`, and others.
1962 let mut cursor = params_node.walk();
1963 for param in params_node.children(&mut cursor) {
1964 match param.kind() {
1965 "typed_parameter" => {
1966 // (typed_parameter (identifier) @name type: (type) @type)
1967 // First identifier child is the name; type child is the type.
1968 let mut name_text = None;
1969 let mut type_text = None;
1970 let mut pc = param.walk();
1971 for child in param.children(&mut pc) {
1972 match child.kind() {
1973 "identifier" if name_text.is_none() => {
1974 let t = source[child.start_byte()..child.end_byte()].to_string();
1975 if t != "self" && t != "cls" {
1976 name_text = Some(t);
1977 }
1978 }
1979 "type" | "identifier" | "attribute"
1980 if type_text.is_none() && name_text.is_some() =>
1981 {
1982 // The type child in tree-sitter-python is a `type` node
1983 // whose text is the annotation expression. Extract the
1984 // base identifier (handle `Optional[Bar]`, `List[Bar]`, etc.)
1985 type_text = Some(extract_python_base_type(source, child));
1986 }
1987 _ => {}
1988 }
1989 }
1990 if let (Some(name), Some(ty)) = (name_text, type_text)
1991 && !ty.is_empty()
1992 && !ty.eq("self")
1993 && !ty.eq("cls")
1994 {
1995 params.insert(name, ty);
1996 }
1997 }
1998 "typed_default_parameter" => {
1999 // (typed_default_parameter name: (identifier) type: (type) value: …)
2000 let name_node = param.child_by_field_name("name");
2001 let type_node = param.child_by_field_name("type");
2002 if let (Some(nn), Some(tn)) = (name_node, type_node) {
2003 let name = source[nn.start_byte()..nn.end_byte()].to_string();
2004 if name != "self" && name != "cls" {
2005 let ty = extract_python_base_type(source, tn);
2006 if !ty.is_empty() {
2007 params.insert(name, ty);
2008 }
2009 }
2010 }
2011 }
2012 _ => {}
2013 }
2014 }
2015 params
2016}
2017
2018/// Extract the base type name from a Python type annotation node.
2019///
2020/// For `Bar` → `"Bar"`. For `Optional[Bar]` or `List[Bar]` → `"Bar"`.
2021/// For `module.Class` → `"Class"` (bare name only).
2022fn extract_python_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2023 match node.kind() {
2024 "identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2025 // tree-sitter-python wraps annotations in a `type` node
2026 "type" => {
2027 let mut cursor = node.walk();
2028 for child in node.children(&mut cursor) {
2029 let t = extract_python_base_type(source, child);
2030 if !t.is_empty() {
2031 return t;
2032 }
2033 }
2034 String::new()
2035 }
2036 // Generic alias: `Optional[Bar]` — the first identifier child is `Optional`,
2037 // the subscript child contains `Bar`. We want the subscript content.
2038 "subscript" => {
2039 // subscript has value (e.g. Optional) and subscript (e.g. Bar).
2040 // Return the subscript's base type (the inner type argument).
2041 if let Some(sub) = node.child_by_field_name("subscript") {
2042 return extract_python_base_type(source, sub);
2043 }
2044 // Fall back: first identifier
2045 let mut cursor = node.walk();
2046 for child in node.children(&mut cursor) {
2047 if child.kind() == "identifier" {
2048 return source[child.start_byte()..child.end_byte()].to_string();
2049 }
2050 }
2051 String::new()
2052 }
2053 // Attribute node `module.Class` → take last identifier
2054 "attribute" => {
2055 if let Some(attr) = node.child_by_field_name("attribute") {
2056 return source[attr.start_byte()..attr.end_byte()].to_string();
2057 }
2058 String::new()
2059 }
2060 _ => {
2061 // Try first identifier child
2062 let mut cursor = node.walk();
2063 for child in node.children(&mut cursor) {
2064 if child.kind() == "identifier" {
2065 return source[child.start_byte()..child.end_byte()].to_string();
2066 }
2067 }
2068 String::new()
2069 }
2070 }
2071}
2072
2073/// Scan a Python function body for `x = ClassName(...)` assignment patterns.
2074///
2075/// Returns a map from local variable name to constructor type.
2076/// E.g., `x = Foo()` → `{"x": "Foo"}`.
2077/// Also handles `x = module.ClassName(...)` → `{"x": "ClassName"}`.
2078fn extract_python_assignment_types(
2079 source: &str,
2080 fn_node: tree_sitter::Node<'_>,
2081) -> HashMap<String, String> {
2082 let mut bindings: HashMap<String, String> = HashMap::new();
2083 let Some(body) = fn_node.child_by_field_name("body") else {
2084 return bindings;
2085 };
2086
2087 let mut stack = vec![body];
2088 while let Some(n) = stack.pop() {
2089 if n.kind() == "assignment" {
2090 // assignment: left = right
2091 // We want: left is a simple identifier, right is a call whose
2092 // function is an identifier starting with an uppercase letter
2093 // (Python convention for class names).
2094 let left = n.child_by_field_name("left");
2095 let right = n.child_by_field_name("right");
2096 if let (Some(lhs), Some(rhs)) = (left, right)
2097 && lhs.kind() == "identifier"
2098 && rhs.kind() == "call"
2099 && let Some(func) = rhs.child_by_field_name("function")
2100 {
2101 let var_name = source[lhs.start_byte()..lhs.end_byte()].to_string();
2102 let constructor_type = match func.kind() {
2103 "identifier" => {
2104 let t = source[func.start_byte()..func.end_byte()].to_string();
2105 // Class names are conventionally uppercase-first
2106 if t.chars().next().is_some_and(char::is_uppercase) {
2107 Some(t)
2108 } else {
2109 None
2110 }
2111 }
2112 "attribute" => {
2113 // `module.ClassName(...)` — take the `attribute` part
2114 func.child_by_field_name("attribute")
2115 .map(|a| source[a.start_byte()..a.end_byte()].to_string())
2116 }
2117 _ => None,
2118 };
2119 if let Some(ty) = constructor_type {
2120 bindings.insert(var_name, ty);
2121 }
2122 }
2123 }
2124 let mut cursor = n.walk();
2125 for child in n.children(&mut cursor) {
2126 stack.push(child);
2127 }
2128 }
2129 bindings
2130}
2131
2132/// Walk a Python function body and annotate attribute-call byte offsets.
2133///
2134/// A Python method call is:
2135/// `(call function: (attribute value: <receiver> attribute: (identifier) @callee))`
2136///
2137/// The receiver is the `value` child of `attribute`. This function checks:
2138/// - `self` → use the enclosing class name (`class_ctx`).
2139/// - An identifier matching a parameter type in `param_types`.
2140/// - An identifier matching a constructor assignment in `let_types`.
2141fn annotate_python_method_calls(
2142 source: &str,
2143 fn_node: tree_sitter::Node<'_>,
2144 class_ctx: Option<&str>,
2145 param_types: &HashMap<String, String>,
2146 let_types: &HashMap<String, String>,
2147 map: &mut HashMap<u32, String>,
2148) {
2149 let mut stack = vec![fn_node];
2150 while let Some(n) = stack.pop() {
2151 if n.kind() == "call"
2152 && let Some(func) = n.child_by_field_name("function")
2153 && func.kind() == "attribute"
2154 && let (Some(recv_node), Some(attr_node)) = (
2155 func.child_by_field_name("object"),
2156 func.child_by_field_name("attribute"),
2157 )
2158 {
2159 // attribute node: object (receiver) + attribute (method name)
2160 let recv_text = source[recv_node.start_byte()..recv_node.end_byte()].to_string();
2161 let receiver_type = if recv_text == "self" || recv_text == "cls" {
2162 class_ctx.map(str::to_owned)
2163 } else if recv_node.kind() == "identifier" {
2164 param_types
2165 .get(&recv_text)
2166 .or_else(|| let_types.get(&recv_text))
2167 .cloned()
2168 } else {
2169 None
2170 };
2171
2172 if let Some(ty) = receiver_type {
2173 // The `@callee` capture byte offset is the `attribute` child.
2174 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2175 let attr_byte = attr_node.start_byte() as u32;
2176 map.insert(attr_byte, ty);
2177 }
2178 }
2179 let mut cursor = n.walk();
2180 for child in n.children(&mut cursor) {
2181 stack.push(child);
2182 }
2183 }
2184}
2185
2186// ── Python class hierarchy (MRO) extraction ───────────────────────────
2187
2188/// Walk the Python parse tree and add class → parent-names entries to `out`.
2189///
2190/// tree-sitter-python shape:
2191/// ```text
2192/// (class_definition
2193/// name: (identifier) @child
2194/// superclasses: (argument_list
2195/// (identifier) @parent ; bare parent: class Foo(Bar):
2196/// (attribute attribute: (identifier) @parent) ; qualified: class Foo(mod.Bar):
2197/// (keyword_argument …) ; ignored: class Foo(Bar, metaclass=Meta):
2198/// )?
2199/// ...)
2200/// ```
2201///
2202/// Each map entry's key is a class defined in the file; the value is the
2203/// ordered list of declared parent class **names** (the trailing `attribute`
2204/// segment, so `mod.Bar` becomes `Bar`). Classes with no declared parents
2205/// still get an entry (empty `Vec`) so a downstream MRO walk can tell
2206/// "known class with no parents" from "unknown class".
2207fn extract_python_class_hierarchy_node(
2208 source: &str,
2209 root: tree_sitter::Node<'_>,
2210 out: &mut HashMap<String, Vec<String>>,
2211) {
2212 let mut stack = vec![root];
2213 while let Some(n) = stack.pop() {
2214 if n.kind() == "class_definition"
2215 && let Some(name_node) = n.child_by_field_name("name")
2216 {
2217 let class_name = source[name_node.start_byte()..name_node.end_byte()].to_string();
2218 let mut parents: Vec<String> = Vec::new();
2219 if let Some(superclasses) = n.child_by_field_name("superclasses") {
2220 // superclasses is an `argument_list`; iterate its children and
2221 // collect identifiers / attribute trailing-names. Skip
2222 // keyword_argument entries (metaclass=…, etc.) and punctuation.
2223 let mut sc = superclasses.walk();
2224 for child in superclasses.children(&mut sc) {
2225 match child.kind() {
2226 "identifier" => {
2227 let t = source[child.start_byte()..child.end_byte()].to_string();
2228 parents.push(t);
2229 }
2230 "attribute" => {
2231 // module.Cls → take the trailing `attribute` segment.
2232 if let Some(attr) = child.child_by_field_name("attribute") {
2233 parents
2234 .push(source[attr.start_byte()..attr.end_byte()].to_string());
2235 }
2236 }
2237 // Drop keyword_argument, "(", ")", ",", comments, etc.
2238 _ => {}
2239 }
2240 }
2241 }
2242 out.insert(class_name, parents);
2243 }
2244 let mut cursor = n.walk();
2245 for child in n.children(&mut cursor) {
2246 stack.push(child);
2247 }
2248 }
2249}
2250
2251/// Extract the Python `class → [parents]` map from a single source file by
2252/// parsing it with tree-sitter-python.
2253///
2254/// Returns an empty map when the source fails to parse or contains no
2255/// `class_definition` nodes. The returned map is the per-file contribution
2256/// to the global hierarchy used by [`resolve_calls_with_python_mro_pub`]
2257/// for MRO-aware receiver-type dispatch (Q1, Wave 2).
2258#[must_use]
2259pub fn extract_python_class_hierarchy(source: &str) -> HashMap<String, Vec<String>> {
2260 let mut parser = Parser::new();
2261 let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
2262 if parser.set_language(&lang).is_err() {
2263 return HashMap::new();
2264 }
2265 let Some(tree) = parser.parse(source, None) else {
2266 return HashMap::new();
2267 };
2268 let mut out: HashMap<String, Vec<String>> = HashMap::new();
2269 extract_python_class_hierarchy_node(source, tree.root_node(), &mut out);
2270 out
2271}
2272
2273/// Compute the linearised MRO (Method Resolution Order) for a Python class
2274/// name using a **simplified left-first depth-first walk** of the declared
2275/// `class → [parents]` hierarchy.
2276///
2277/// Python's real MRO uses C3 linearisation, which is monotonic and respects
2278/// declaration order across the diamond inheritance shape. For ripvec's
2279/// reverse-call-graph purpose we want *any plausible ancestor* of the
2280/// receiver type — including ancestors only reachable via a mixin — so we
2281/// can resolve `self.method()` calls whose dispatch lands on an ancestor.
2282/// The simplification: pre-order DFS, left-to-right, skipping cycles via a
2283/// `visited` set.
2284///
2285/// On a non-diamond shape this matches C3 exactly. On a diamond the
2286/// simplified walk may surface an ancestor earlier than C3 would, but every
2287/// ancestor C3 would visit is still reached — and ripvec's goal is "find
2288/// the implementing def for an inherited call", not "compute the runtime
2289/// dispatch winner". Over-approximating ancestors only matters when two
2290/// ancestors define the same method, and even then the left-first order
2291/// matches C3 on the common patterns
2292/// (`class Sub(Base, Mixin)` → `Sub, Base, Mixin, <Base's ancestors>, <Mixin's ancestors>`).
2293///
2294/// The returned list excludes the start class itself. Each entry appears at
2295/// most once even when reachable through multiple parent chains.
2296fn compute_python_mro<H: std::hash::BuildHasher>(
2297 start: &str,
2298 hierarchy: &HashMap<String, Vec<String>, H>,
2299) -> Vec<String> {
2300 use std::collections::HashSet;
2301 let mut order: Vec<String> = Vec::new();
2302 let mut visited: HashSet<String> = HashSet::new();
2303 // Start with the immediate parents of `start` (the receiver-type's own
2304 // scope was already searched by Priority 2's direct match).
2305 let Some(start_parents) = hierarchy.get(start) else {
2306 return order;
2307 };
2308 // DFS stack: we push in reverse so pop yields left-first order.
2309 let mut stack: Vec<String> = start_parents.iter().rev().cloned().collect();
2310 while let Some(cls) = stack.pop() {
2311 if !visited.insert(cls.clone()) {
2312 continue;
2313 }
2314 order.push(cls.clone());
2315 if let Some(parents) = hierarchy.get(&cls) {
2316 for p in parents.iter().rev() {
2317 if !visited.contains(p) {
2318 stack.push(p.clone());
2319 }
2320 }
2321 }
2322 }
2323 order
2324}
2325
2326// ── Go receiver-type heuristic ────────────────────────────────────────
2327
2328/// Walk the Go parse tree and fill `map` with receiver-type inference.
2329///
2330/// One heuristic case: **`recv.Method()` inside a `method_declaration`**.
2331///
2332/// Go methods have an explicit receiver parameter in their signature:
2333/// `func (r *Foo) Bar() { r.Baz() }` — `r` is bound to type `Foo`.
2334///
2335/// The Go call query captures:
2336/// `(call_expression function: (selector_expression field: (field_identifier) @callee))`
2337///
2338/// Within `selector_expression`, `operand` is the receiver expression and
2339/// `field` is the method name (the `@callee` capture).
2340///
2341/// This function also handles `self.method()` patterns for cases where code
2342/// uses `self` as a receiver name (not idiomatic Go, but it occurs).
2343fn collect_go_receiver_types(
2344 source: &str,
2345 root: tree_sitter::Node<'_>,
2346 map: &mut HashMap<u32, String>,
2347) {
2348 // Stack carries (node, receiver_binding: Option<(recv_name, recv_type)>).
2349 let mut stack: Vec<(tree_sitter::Node<'_>, Option<(String, String)>)> = vec![(root, None)];
2350
2351 while let Some((n, recv_binding)) = stack.pop() {
2352 if n.kind() == "method_declaration" {
2353 // Extract the receiver name and type from the method signature.
2354 let binding = extract_go_receiver_binding(source, n);
2355 let new_binding = binding.or_else(|| recv_binding.clone());
2356 let mut cursor = n.walk();
2357 for child in n.children(&mut cursor) {
2358 stack.push((child, new_binding.clone()));
2359 }
2360 } else {
2361 // For any call_expression whose function is a selector_expression,
2362 // check if the operand matches the active receiver binding.
2363 if n.kind() == "call_expression"
2364 && let Some(func) = n.child_by_field_name("function")
2365 && func.kind() == "selector_expression"
2366 && let (Some(operand), Some(field)) = (
2367 func.child_by_field_name("operand"),
2368 func.child_by_field_name("field"),
2369 )
2370 {
2371 let recv_text = source[operand.start_byte()..operand.end_byte()].to_string();
2372 let receiver_type = recv_binding.as_ref().and_then(|(recv_name, recv_ty)| {
2373 if recv_text == *recv_name {
2374 Some(recv_ty.clone())
2375 } else {
2376 None
2377 }
2378 });
2379
2380 if let Some(ty) = receiver_type {
2381 // The `@callee` capture byte offset is the `field` child.
2382 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2383 let field_byte = field.start_byte() as u32;
2384 map.insert(field_byte, ty);
2385 }
2386 }
2387
2388 let mut cursor = n.walk();
2389 for child in n.children(&mut cursor) {
2390 stack.push((child, recv_binding.clone()));
2391 }
2392 }
2393 }
2394}
2395
2396/// Extract the receiver name and base type from a Go `method_declaration`.
2397///
2398/// Go method declaration shape (tree-sitter-go):
2399/// ```text
2400/// (method_declaration
2401/// receiver: (parameter_list
2402/// (parameter_declaration
2403/// name: (identifier) ← receiver name
2404/// type: (type_identifier ← receiver type (bare)
2405/// | pointer_type (type_identifier)) ← or *Type
2406/// )
2407/// )
2408/// name: (field_identifier)
2409/// ...
2410/// )
2411/// ```
2412///
2413/// Returns `Some((recv_name, type_name))` or `None` if the receiver is unnamed
2414/// (blank identifier `_`) or has an unrecognisable shape.
2415fn extract_go_receiver_binding(
2416 source: &str,
2417 method_node: tree_sitter::Node<'_>,
2418) -> Option<(String, String)> {
2419 let receiver_list = method_node.child_by_field_name("receiver")?;
2420 // parameter_list contains one parameter_declaration
2421 let mut cursor = receiver_list.walk();
2422 for param in receiver_list.children(&mut cursor) {
2423 if param.kind() == "parameter_declaration" {
2424 let name_node = param.child_by_field_name("name");
2425 let type_node = param.child_by_field_name("type");
2426 if let (Some(nn), Some(tn)) = (name_node, type_node) {
2427 let name = source[nn.start_byte()..nn.end_byte()].to_string();
2428 if name == "_" || name.is_empty() {
2429 return None;
2430 }
2431 let ty = extract_go_base_type(source, tn);
2432 if !ty.is_empty() {
2433 return Some((name, ty));
2434 }
2435 }
2436 }
2437 }
2438 None
2439}
2440
2441/// Extract the base type name from a Go type node.
2442///
2443/// For `Foo` (type_identifier) → `"Foo"`.
2444/// For `*Foo` (pointer_type → type_identifier) → `"Foo"`.
2445fn extract_go_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2446 match node.kind() {
2447 "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2448 "pointer_type" => {
2449 // pointer_type has one child: the pointee type
2450 let mut cursor = node.walk();
2451 for child in node.children(&mut cursor) {
2452 if child.kind() == "type_identifier" {
2453 return source[child.start_byte()..child.end_byte()].to_string();
2454 }
2455 let t = extract_go_base_type(source, child);
2456 if !t.is_empty() {
2457 return t;
2458 }
2459 }
2460 String::new()
2461 }
2462 _ => {
2463 let mut cursor = node.walk();
2464 for child in node.children(&mut cursor) {
2465 if child.kind() == "type_identifier" {
2466 return source[child.start_byte()..child.end_byte()].to_string();
2467 }
2468 }
2469 String::new()
2470 }
2471 }
2472}
2473
2474/// Enrich Go `method_declaration` definition scopes with their receiver type name.
2475///
2476/// In the generic `extract_definitions` path, `build_scope_chain` walks the
2477/// *parent* chain of the `@def` node. For Go `method_declaration`, the parent
2478/// is the file root — so the scope is always `""`.
2479///
2480/// An empty scope means `resolve_calls` Priority 2 (receiver-type matching via
2481/// `scope.contains(recv_type)`) never fires for Go methods. Cross-file calls
2482/// where the caller inferred `receiver_type = Some("Foo")` stay unresolved;
2483/// no edge is recorded; `def_callers[]` stays empty for those defs — the root
2484/// cause of the missing inverse index for Go (I#P1).
2485///
2486/// Fix: after `extract_definitions`, parse the Go source a second time to find
2487/// each `method_declaration`'s receiver type, then set the matching def's scope
2488/// to `"method_declaration {ReceiverType}"`. This matches the pattern used by
2489/// the existing `go_resolve_receiver_method_via_signature` integration test,
2490/// which asserts that `scope.contains("Foo")` succeeds when the scope is
2491/// `"method_declaration Foo"`.
2492///
2493/// Matching is by `start_byte` (precise) so name collisions across different
2494/// receiver types are handled correctly.
2495fn enrich_go_method_def_scopes(source: &str, defs: &mut [Definition]) {
2496 let go_lang: tree_sitter::Language = tree_sitter_go::LANGUAGE.into();
2497 let mut parser = Parser::new();
2498 if parser.set_language(&go_lang).is_err() {
2499 return;
2500 }
2501 let Some(tree) = parser.parse(source, None) else {
2502 return;
2503 };
2504
2505 // Walk all top-level method_declaration nodes.
2506 let root = tree.root_node();
2507 let mut method_cursor = root.walk();
2508 for child in root.children(&mut method_cursor) {
2509 if child.kind() != "method_declaration" {
2510 continue;
2511 }
2512 let Some((_, recv_type)) = extract_go_receiver_binding(source, child) else {
2513 continue;
2514 };
2515 // Match by start_byte (precise): the @def node for method_declaration in
2516 // the Go definition query is the method_declaration node itself, so its
2517 // start_byte matches the def's start_byte recorded during extract_definitions.
2518 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2519 let method_start_byte = child.start_byte() as u32;
2520 for def in defs.iter_mut() {
2521 if def.kind == "method_declaration" && def.start_byte == method_start_byte {
2522 def.scope = format!("method_declaration {recv_type}");
2523 break;
2524 }
2525 }
2526 }
2527}
2528
2529/// Public wrapper for `enrich_go_method_def_scopes` — enables integration tests
2530/// to call it directly without going through the full `build_graph` pipeline.
2531pub fn enrich_go_method_def_scopes_pub(source: &str, defs: &mut [Definition]) {
2532 enrich_go_method_def_scopes(source, defs);
2533}
2534
2535/// SQL: prepend a synthetic whole-file definition whose name is the filename stem.
2536///
2537/// dbt and sqlmesh follow a filename-as-model-name convention:
2538///
2539/// - `silver_issuer_returns.sql` defines the `silver_issuer_returns` model.
2540/// - `gold_issuer_returns.sql` references the silver model by filename stem,
2541/// not by any in-source CREATE TABLE.
2542///
2543/// In sqlmesh, the in-source name is templated:
2544///
2545/// ```sql
2546/// MODEL (
2547/// name @{athena_sqlmesh_silver_schema}.issuer_returns,
2548/// ...
2549/// );
2550/// SELECT ... FROM @{athena_sqlmesh_silver_schema}.stg_issuer_returns;
2551/// ```
2552///
2553/// The `MODEL (...)` header parses as an ERROR node under tree-sitter-sequel
2554/// because `@{var}` is not standard SQL; FROM/JOIN further down the file still
2555/// extract cleanly. Without a synthetic def, `lsp_workspace_symbols(query=
2556/// "silver_issuer_returns")` returns no hits — there is no real CREATE TABLE
2557/// in the file and the model name is interpolation only.
2558///
2559/// This helper prepends a definition with:
2560/// - `name` = filename stem (e.g., `silver_issuer_returns`)
2561/// - `kind` = `"sql_file"` (maps to `LSP SymbolKind::File` in
2562/// [`languages::lsp_symbol_kind_for_node_kind`])
2563/// - byte range = the entire source `[0, source.len())`
2564/// - scope / signature / qualified_name = empty / None
2565///
2566/// The whole-file byte range is the key to FROM/JOIN attribution: when
2567/// `extract_calls` later places a CallRef from a FROM clause that is not
2568/// inside any CTE or other smaller def, the smallest-enclosing-def search
2569/// lands on this synthetic file def and the edge is recorded.
2570///
2571/// If the filename has no stem (empty / `..`), the helper is a no-op.
2572/// Idempotent: if a `sql_file` def already exists at byte 0, it is left alone.
2573pub(crate) fn enrich_sql_file_def(filename: &str, source: &str, defs: &mut Vec<Definition>) {
2574 // Idempotency: do nothing if a sql_file def is already present at byte 0.
2575 if defs
2576 .iter()
2577 .any(|d| d.kind == "sql_file" && d.start_byte == 0)
2578 {
2579 return;
2580 }
2581
2582 // Derive the filename stem (last path component, file extension stripped).
2583 let stem = std::path::Path::new(filename)
2584 .file_stem()
2585 .and_then(|s| s.to_str())
2586 .unwrap_or_default();
2587 if stem.is_empty() {
2588 return;
2589 }
2590
2591 // Count newlines so end_line is reasonable for downstream UI.
2592 let end_line_zero_based = source.bytes().filter(|&b| b == b'\n').count();
2593 #[expect(clippy::cast_possible_truncation, reason = "line counts fit in u32")]
2594 let end_line = (end_line_zero_based as u32) + 1;
2595 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2596 let end_byte = source.len() as u32;
2597
2598 let file_def = Definition {
2599 name: stem.to_string(),
2600 kind: "sql_file".to_string(),
2601 start_line: 1,
2602 end_line,
2603 scope: String::new(),
2604 signature: None,
2605 start_byte: 0,
2606 end_byte,
2607 calls: vec![],
2608 decorator: None,
2609 lsp_kind_hint: None,
2610 };
2611 // Prepend so it remains the outermost (largest) enclosing def at byte 0,
2612 // ensuring narrow CTE defs are still preferred for inner-FROM attribution
2613 // (the smallest-enclosing rule in `extract_calls`).
2614 defs.insert(0, file_def);
2615}
2616
2617/// Public wrapper for [`enrich_sql_file_def`] — enables integration tests
2618/// to call it directly without going through the full `build_graph` pipeline.
2619///
2620/// This is `pub` (not `pub(crate)`) because integration tests in
2621/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
2622/// `pub(crate)` items.
2623pub fn enrich_sql_file_def_pub(filename: &str, source: &str, defs: &mut Vec<Definition>) {
2624 enrich_sql_file_def(filename, source, defs);
2625}
2626
2627/// Public wrapper for `extract_calls` — enables integration tests to call it
2628/// directly without going through the full `build_graph` pipeline.
2629///
2630/// This is `pub` (not `pub(crate)`) because integration tests in
2631/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
2632/// `pub(crate)` items.
2633pub fn extract_calls_pub(
2634 source: &str,
2635 call_config: &languages::CallConfig,
2636 defs: &mut [Definition],
2637) {
2638 extract_calls(source, call_config, defs);
2639}
2640
2641/// Public wrapper for [`extract_definitions`] — enables integration tests in
2642/// `crates/ripvec-core/tests/` to drive Python decorator detection (C1, 4.1.1).
2643///
2644/// Returns the [`Definition`] list for `source` parsed as the language
2645/// identified by `lang_config`.
2646#[must_use]
2647pub fn extract_definitions_pub(
2648 source: &str,
2649 lang_config: &languages::LangConfig,
2650) -> Vec<Definition> {
2651 extract_definitions(source, lang_config)
2652}
2653
2654/// Build an index from definition name to list of `DefId`s.
2655#[must_use]
2656pub fn build_def_index_pub(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2657 build_def_index(files)
2658}
2659
2660fn build_def_index(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2661 let mut index: HashMap<String, Vec<DefId>> = HashMap::new();
2662 for (file_idx, file) in files.iter().enumerate() {
2663 for (def_idx, def) in file.defs.iter().enumerate() {
2664 #[expect(clippy::cast_possible_truncation)]
2665 let did: DefId = (file_idx as u32, def_idx as u16);
2666 index.entry(def.name.clone()).or_default().push(did);
2667 }
2668 }
2669 index
2670}
2671
2672/// Resolve call references to target definitions.
2673///
2674/// Resolution priority:
2675///
2676/// 1. **Qualified path** (`qualified_path = Some("mod_a::foo")`): filter candidates
2677/// by qualifier match (file path or scope contains the qualifier segment). Unique
2678/// match → resolve; ambiguous or no match → leave `None`.
2679/// 2. **Receiver type — direct scope match** (`receiver_type = Some("Foo")`):
2680/// for method calls, prefer candidates whose `scope` contains the receiver
2681/// type name (e.g., `"impl_item Foo"`). Among receiver-matching candidates,
2682/// further prefer those in imported files. Unique match → resolve;
2683/// ambiguous → leave `None`.
2684/// When this step finds nothing on the receiver class itself, sub-step 2b
2685/// (Python MRO walk) runs: when `Foo` has a recorded parent chain, walk
2686/// the receiver class's MRO (left-first DFS) and try the scope-match
2687/// against each ancestor's name. First ancestor with a matching candidate
2688/// wins. See [`compute_python_mro`] for the simplification rationale (it
2689/// diverges from C3 only on diamond shapes where two ancestors define the
2690/// same name).
2691/// 3. **Same file** (unqualified, no receiver): prefer definitions in the caller's
2692/// own file.
2693/// 4. **SQL suffix-match** (sql_file callers only, no exact-name match): when
2694/// the caller def has `kind = "sql_file"` and the bare `call_name` (e.g.,
2695/// `"issuer_returns"`) has no exact entry in the def index, scan all
2696/// `sql_file` defs for names ending with `_<call_name>` (e.g.,
2697/// `"silver_issuer_returns"`). This bridges dbt / sqlmesh layered schema
2698/// prefixes: `gold_issuer_returns.sql` uses `FROM @{schema}.issuer_returns`
2699/// which tree-sitter reduces to the bare name `"issuer_returns"`, while the
2700/// target def is the synthetic `sql_file` def named `"silver_issuer_returns"`.
2701/// Unique suffix-match → resolve; ambiguous (multiple layers match) or
2702/// no match → leave `None`. Non-sql_file callers are explicitly excluded.
2703/// 5. **Imported file** (unqualified, no receiver): check definitions in files this
2704/// file imports. Unique imported candidate → resolve.
2705/// 6. **Global-unique fallback**: when a bare call name maps to exactly one def in the
2706/// entire graph — regardless of file or import relationship — resolve to it.
2707/// Handles trait-method dispatch (`Trait::method` called as bare `method`) and
2708/// struct constructors referenced across non-imported module boundaries.
2709/// Only fires when Priorities 1–5 left the call unresolved and exactly one
2710/// candidate exists. Ambiguous (>1 candidates) → leave `None`.
2711/// 7. **Ambiguous or unresolved**: leave `resolved` as `None` (no silent first-wins).
2712///
2713/// Equivalent to [`resolve_calls_with_python_mro_pub`] with an empty MRO map
2714/// (Priority 2.5 is a no-op).
2715pub fn resolve_calls_pub<S: std::hash::BuildHasher>(
2716 files: &mut [FileNode],
2717 def_index: &HashMap<String, Vec<DefId>, S>,
2718) {
2719 let empty: HashMap<String, Vec<String>> = HashMap::new();
2720 resolve_calls_inner(files, def_index, &empty);
2721}
2722
2723/// Resolve call references with MRO-aware Python receiver dispatch enabled.
2724///
2725/// Identical to [`resolve_calls_pub`] except that Priority 2.5 (the MRO walk)
2726/// fires when the caller passes a non-empty `python_class_hierarchy`.
2727/// `build_graph` populates the hierarchy by parsing every Python source file
2728/// with [`extract_python_class_hierarchy`] and merging the per-file maps.
2729///
2730/// Tests that want to exercise MRO resolution without going through
2731/// `build_graph` can call this directly with a synthetic hierarchy.
2732pub fn resolve_calls_with_python_mro_pub<S, H>(
2733 files: &mut [FileNode],
2734 def_index: &HashMap<String, Vec<DefId>, S>,
2735 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2736) where
2737 S: std::hash::BuildHasher,
2738 H: std::hash::BuildHasher,
2739{
2740 resolve_calls_inner(files, def_index, python_class_hierarchy);
2741}
2742
2743fn resolve_calls<S, H>(
2744 files: &mut [FileNode],
2745 def_index: &HashMap<String, Vec<DefId>, S>,
2746 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2747) where
2748 S: std::hash::BuildHasher,
2749 H: std::hash::BuildHasher,
2750{
2751 resolve_calls_inner(files, def_index, python_class_hierarchy);
2752}
2753
2754#[expect(
2755 clippy::too_many_lines,
2756 reason = "8-priority resolution cascade (qualified path, receiver type, MRO walk, same-file, \
2757 SQL suffix-match, imported-file, global-unique, ambiguous); each priority is a \
2758 distinct decision branch and extracting helpers would require passing large shared \
2759 state across boundaries"
2760)]
2761fn resolve_calls_inner<S, H>(
2762 files: &mut [FileNode],
2763 def_index: &HashMap<String, Vec<DefId>, S>,
2764 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2765) where
2766 S: std::hash::BuildHasher,
2767 H: std::hash::BuildHasher,
2768{
2769 // Pre-compute imported file sets for each file.
2770 let imported_files: Vec<std::collections::HashSet<u32>> = files
2771 .iter()
2772 .map(|f| {
2773 f.imports
2774 .iter()
2775 .filter_map(|imp| imp.resolved_idx)
2776 .collect()
2777 })
2778 .collect();
2779
2780 for file_idx in 0..files.len() {
2781 for def_idx in 0..files[file_idx].defs.len() {
2782 for call_idx in 0..files[file_idx].defs[def_idx].calls.len() {
2783 let call_name = files[file_idx].defs[def_idx].calls[call_idx].name.clone();
2784 let qualified_path = files[file_idx].defs[def_idx].calls[call_idx]
2785 .qualified_path
2786 .clone();
2787 let receiver_type = files[file_idx].defs[def_idx].calls[call_idx]
2788 .receiver_type
2789 .clone();
2790
2791 // HCL: dedicated resolution for `terraform_remote_state.<NAME>`
2792 // and `module.<NAME>` qualified paths. Aurora's module DAG is
2793 // expressed by these patterns; resolve to the first def in any
2794 // file under a `/<NAME>/` directory segment. This is the
2795 // module-source contract (R2 + R3, Wave 3).
2796 if let Some(ref qpath) = qualified_path
2797 && (qpath.starts_with("terraform_remote_state.")
2798 || qpath.starts_with("module."))
2799 {
2800 let target = &call_name; // already the bare module label
2801 let segment_match = format!("/{target}/");
2802 let alt_segment_prefix = format!("{target}/"); // when path starts with target dir
2803 let candidate = files.iter().enumerate().find_map(|(idx, f)| {
2804 if f.path.contains(&segment_match)
2805 || f.path.starts_with(&alt_segment_prefix)
2806 {
2807 // Pick the first def in the file (or skip if file
2808 // has no defs).
2809 if !f.defs.is_empty() {
2810 #[expect(
2811 clippy::cast_possible_truncation,
2812 reason = "file index fits in u32"
2813 )]
2814 {
2815 return Some((idx as u32, 0u16));
2816 }
2817 }
2818 }
2819 None
2820 });
2821 if let Some(did) = candidate {
2822 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2823 }
2824 continue;
2825 }
2826
2827 // ── Priority 1: Qualified-path resolution ────────────────
2828 //
2829 // `qualified_path` carries the full scoped path (e.g. "mod_a::foo").
2830 // We look up candidates by the bare `call_name`, then filter by
2831 // whether the file path or scope contains the qualifier prefix.
2832 if let Some(ref qpath) = qualified_path {
2833 // Qualifier is everything before the final `::`.
2834 let qualifier = if let Some(pos) = qpath.rfind("::") {
2835 &qpath[..pos]
2836 } else {
2837 qpath.as_str()
2838 };
2839 let qual_segments: Vec<&str> = qualifier.split("::").collect();
2840
2841 let Some(candidates) = def_index.get(&call_name) else {
2842 continue;
2843 };
2844
2845 let matching: Vec<DefId> = candidates
2846 .iter()
2847 .copied()
2848 .filter(|&(f_idx, _)| {
2849 let file_path = &files[f_idx as usize].path;
2850 let last_segment = qual_segments.last().copied().unwrap_or("");
2851 let path_as_module =
2852 file_path.trim_end_matches(".rs").replace(['/', '\\'], "::");
2853 path_as_module.contains(last_segment)
2854 || file_path.contains(last_segment)
2855 })
2856 .collect();
2857
2858 if matching.len() == 1 {
2859 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(matching[0]);
2860 }
2861 // Ambiguous or no match → leave None.
2862 continue;
2863 }
2864
2865 // ── Priority 3.5: SQL suffix-match resolution ────────────
2866 //
2867 // dbt / sqlmesh pipelines use layered schema prefixes:
2868 // `silver_issuer_returns.sql` defines the silver-layer model.
2869 // `gold_issuer_returns.sql` references it via a FROM clause
2870 // that tree-sitter parses as `name: "issuer_returns"` (the
2871 // `name:` field-selector strips the `@{schema}.` prefix).
2872 // `def_index.get("issuer_returns")` returns None — the def is
2873 // stored under "silver_issuer_returns".
2874 //
2875 // When: (a) no exact-name candidate exists, AND (b) the
2876 // caller's enclosing def is a sql_file (whole-file synthetic
2877 // def emitted by `enrich_sql_file_def`), walk every sql_file
2878 // def in the graph and check whether its name ends with
2879 // `_<call_name>`. Unique suffix-match → resolve. Ambiguous
2880 // (e.g., both gold_ and silver_ match the same bare name) →
2881 // leave None (no silent first-wins).
2882 //
2883 // Non-sql_file callers are explicitly excluded: a Rust
2884 // function_item whose call_name happens to end with a suffix
2885 // of some sql_file def must NOT be resolved via this path.
2886 if !def_index.contains_key(&call_name)
2887 && files[file_idx].defs[def_idx].kind == "sql_file"
2888 && !call_name.is_empty()
2889 {
2890 let suffix = format!("_{call_name}");
2891 let suffix_str = suffix.as_str();
2892 // Exclude the caller def itself from the suffix scan:
2893 // `gold_issuer_returns` also ends with `_issuer_returns`
2894 // but must not self-resolve.
2895 #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
2896 let caller_did: DefId = (file_idx as u32, def_idx as u16);
2897 let suffix_matches: Vec<DefId> = files
2898 .iter()
2899 .enumerate()
2900 .flat_map(|(f_idx, f)| {
2901 f.defs.iter().enumerate().filter_map(move |(d_idx, d)| {
2902 #[expect(
2903 clippy::cast_possible_truncation,
2904 reason = "file and def indices fit in u32/u16"
2905 )]
2906 let did: DefId = (f_idx as u32, d_idx as u16);
2907 if d.kind == "sql_file"
2908 && d.name.ends_with(suffix_str)
2909 && did != caller_did
2910 {
2911 Some(did)
2912 } else {
2913 None
2914 }
2915 })
2916 })
2917 .collect();
2918 if suffix_matches.len() == 1 {
2919 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2920 Some(suffix_matches[0]);
2921 }
2922 // Ambiguous (>1) or no match (0) → leave None.
2923 continue;
2924 }
2925
2926 let Some(candidates) = def_index.get(&call_name) else {
2927 continue;
2928 };
2929
2930 // ── Priority 2: Receiver-type resolution ─────────────────
2931 //
2932 // `receiver_type = Some("Foo")` means this is a method call on a
2933 // value whose type is `Foo`. Filter candidates to those whose scope
2934 // chain contains the receiver type name.
2935 if let Some(ref rtype) = receiver_type {
2936 // Candidates whose scope contains the receiver type name.
2937 let receiver_matching: Vec<DefId> = candidates
2938 .iter()
2939 .copied()
2940 .filter(|&(f_idx, d_idx)| {
2941 let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2942 scope.contains(rtype.as_str())
2943 })
2944 .collect();
2945
2946 if receiver_matching.len() == 1 {
2947 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2948 Some(receiver_matching[0]);
2949 continue;
2950 }
2951
2952 if receiver_matching.len() > 1 {
2953 // Among receiver-matching candidates, prefer those in imported files.
2954 let imported_receiver_matching: Vec<DefId> = receiver_matching
2955 .iter()
2956 .copied()
2957 .filter(|(f, _)| imported_files[file_idx].contains(f))
2958 .collect();
2959 if imported_receiver_matching.len() == 1 {
2960 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2961 Some(imported_receiver_matching[0]);
2962 }
2963 // Ambiguous even after import filter → leave None.
2964 continue;
2965 }
2966
2967 // ── Priority 2.5: Python MRO walk ─────────────────────
2968 //
2969 // The receiver type's own scope has no matching def — but
2970 // the method may live on an ancestor class. Walk the MRO
2971 // (left-first DFS) and try the scope-match against each
2972 // ancestor's name. First ancestor with at least one
2973 // scope-matching candidate wins; if multiple candidates
2974 // match for the same ancestor, prefer imported files,
2975 // else take the first in stable order.
2976 //
2977 // Liskov: a subclass's `self.method()` call must dispatch
2978 // through the MRO; over-approximating ancestors is the
2979 // correct conservative move for a reverse call graph.
2980 // For non-Python languages or Python receivers with no
2981 // recorded parents, `compute_python_mro` returns an
2982 // empty vector and this loop is a no-op.
2983 let mro = compute_python_mro(rtype, python_class_hierarchy);
2984 let mut resolved_via_mro: Option<DefId> = None;
2985 for ancestor in &mro {
2986 let ancestor_matching: Vec<DefId> = candidates
2987 .iter()
2988 .copied()
2989 .filter(|&(f_idx, d_idx)| {
2990 let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2991 scope.contains(ancestor.as_str())
2992 })
2993 .collect();
2994 if ancestor_matching.len() == 1 {
2995 resolved_via_mro = Some(ancestor_matching[0]);
2996 break;
2997 }
2998 if ancestor_matching.len() > 1 {
2999 // Prefer imported files among the ancestor matches.
3000 let imported_ancestor: Vec<DefId> = ancestor_matching
3001 .iter()
3002 .copied()
3003 .filter(|(f, _)| imported_files[file_idx].contains(f))
3004 .collect();
3005 if imported_ancestor.len() == 1 {
3006 resolved_via_mro = Some(imported_ancestor[0]);
3007 break;
3008 }
3009 // Ambiguous at this ancestor — pick the first
3010 // candidate in stable order. The MRO walk's job
3011 // is to find *an* implementing def for an
3012 // inherited call, not to compute the runtime
3013 // dispatch winner; any plausible candidate is
3014 // useful for the reverse call graph.
3015 resolved_via_mro = Some(ancestor_matching[0]);
3016 break;
3017 }
3018 }
3019 if let Some(did) = resolved_via_mro {
3020 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3021 continue;
3022 }
3023 // No receiver-matching candidates anywhere in the MRO →
3024 // fall through to bare-name resolution.
3025 }
3026
3027 // ── Priority 3: Same-file resolution ─────────────────────
3028 #[expect(clippy::cast_possible_truncation)]
3029 let file_idx_u32 = file_idx as u32;
3030 if let Some(&did) = candidates.iter().find(|(f, _)| *f == file_idx_u32) {
3031 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3032 continue;
3033 }
3034
3035 // ── Priority 4: Imported-file resolution ──────────────────
3036 let imported_candidates: Vec<DefId> = candidates
3037 .iter()
3038 .copied()
3039 .filter(|(f, _)| imported_files[file_idx].contains(f))
3040 .collect();
3041 if imported_candidates.len() == 1 {
3042 files[file_idx].defs[def_idx].calls[call_idx].resolved =
3043 Some(imported_candidates[0]);
3044 }
3045
3046 // ── Priority 6: Global-unique fallback ────────────────────────────
3047 //
3048 // When a bare call name maps to exactly one def in the entire graph
3049 // — regardless of file or import relationship — resolve to it. This
3050 // bridges trait-method dispatch (`Trait::method` called as bare
3051 // `method`) and constructors referenced across non-imported module
3052 // boundaries.
3053 //
3054 // Guard with `.is_none()` so we do not overwrite a Priority 5
3055 // (imported-file) resolution that already fired.
3056 if candidates.len() == 1
3057 && files[file_idx].defs[def_idx].calls[call_idx]
3058 .resolved
3059 .is_none()
3060 {
3061 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(candidates[0]);
3062 }
3063 // Priority 7: Ambiguous or unresolved → leave None.
3064 }
3065 }
3066 }
3067}
3068
3069/// Compute a prefix-sum offset table for flattening `DefId`s to linear indices.
3070fn def_offsets(files: &[FileNode]) -> Vec<usize> {
3071 let mut offsets = Vec::with_capacity(files.len() + 1);
3072 offsets.push(0);
3073 for file in files {
3074 offsets.push(offsets.last().unwrap() + file.defs.len());
3075 }
3076 offsets
3077}
3078
3079/// Flatten a `DefId` to a linear index using the offset table.
3080fn flatten_def_id(offsets: &[usize], did: DefId) -> usize {
3081 offsets[did.0 as usize] + did.1 as usize
3082}
3083
3084/// Build top-N caller and callee lists for each definition (flattened).
3085fn build_def_neighbor_lists(
3086 n: usize,
3087 edges: &[(u32, u32, u32)],
3088 offsets: &[usize],
3089) -> (Vec<Vec<DefId>>, Vec<Vec<DefId>>) {
3090 let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3091 let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3092
3093 for &(src, dst, w) in edges {
3094 let (s, d) = (src as usize, dst as usize);
3095 if s < n && d < n {
3096 incoming[d].push((src, w));
3097 outgoing[s].push((dst, w));
3098 }
3099 }
3100
3101 // Convert flat index back to DefId
3102 let to_def_id = |flat: u32| -> DefId {
3103 let flat_usize = flat as usize;
3104 let file_idx = offsets.partition_point(|&o| o <= flat_usize) - 1;
3105 let def_idx = flat_usize - offsets[file_idx];
3106 #[expect(clippy::cast_possible_truncation)]
3107 (file_idx as u32, def_idx as u16)
3108 };
3109
3110 let callers = incoming
3111 .into_iter()
3112 .map(|mut v| {
3113 v.sort_by_key(|b| std::cmp::Reverse(b.1));
3114 v.truncate(MAX_NEIGHBORS);
3115 v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
3116 })
3117 .collect();
3118
3119 let callees = outgoing
3120 .into_iter()
3121 .map(|mut v| {
3122 v.sort_by_key(|b| std::cmp::Reverse(b.1));
3123 v.truncate(MAX_NEIGHBORS);
3124 v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
3125 })
3126 .collect();
3127
3128 (callers, callees)
3129}
3130
3131// ── PageRank ─────────────────────────────────────────────────────────
3132
3133/// Compute `PageRank` scores for a graph.
3134///
3135/// If `focus` is `Some(idx)`, computes topic-sensitive `PageRank` biased
3136/// toward file `idx`. Otherwise computes standard (uniform) `PageRank`.
3137///
3138/// Returns one score per node, summing to 1.0.
3139#[expect(
3140 clippy::cast_precision_loss,
3141 reason = "node count fits comfortably in f32"
3142)]
3143fn pagerank(n: usize, edges: &[(u32, u32, u32)], focus: Option<usize>) -> Vec<f32> {
3144 if n == 0 {
3145 return vec![];
3146 }
3147
3148 // Build adjacency: out_edges[src] = [(dst, weight)]
3149 let mut out_edges: Vec<Vec<(usize, f32)>> = vec![vec![]; n];
3150 let mut out_weight: Vec<f32> = vec![0.0; n];
3151
3152 for &(src, dst, w) in edges {
3153 let (s, d) = (src as usize, dst as usize);
3154 if s < n && d < n {
3155 #[expect(clippy::cast_possible_truncation, reason = "edge weights are small")]
3156 let wf = f64::from(w) as f32;
3157 out_edges[s].push((d, wf));
3158 out_weight[s] += wf;
3159 }
3160 }
3161
3162 // Personalization vector (Haveliwala 2002, topic-sensitive PageRank).
3163 //
3164 // When a focus file is specified, the teleportation distribution is split:
3165 // - PERSONALIZATION_ALPHA (0.15) concentrated on the focus node.
3166 // - (1 - PERSONALIZATION_ALPHA) = 0.85 spread uniformly over the
3167 // remaining (n - 1) other nodes.
3168 //
3169 // This gives the focus file a gentle bias over its neighbors without
3170 // collapsing every other file to an equal uniform floor. The resulting
3171 // ranks still vary across the corpus, so the caller sees a *neighborhood*
3172 // of semantically related files rebiased toward the focus (I#16 fix).
3173 //
3174 // For n == 1 there are no other nodes; the focus gets all mass (= 1.0).
3175 let bias: Vec<f32> = if let Some(idx) = focus {
3176 if n == 1 {
3177 vec![1.0_f32]
3178 } else {
3179 let other_mass = (1.0_f32 - PERSONALIZATION_ALPHA) / (n as f32 - 1.0);
3180 let mut b = vec![other_mass; n];
3181 if idx < n {
3182 b[idx] = PERSONALIZATION_ALPHA;
3183 }
3184 // Verify sum ≈ 1.0 (should hold by construction; normalization
3185 // guards against floating-point drift on very large graphs).
3186 let sum: f32 = b.iter().sum();
3187 for v in &mut b {
3188 *v /= sum;
3189 }
3190 b
3191 }
3192 } else {
3193 vec![1.0 / n as f32; n]
3194 };
3195
3196 let mut rank = vec![1.0 / n as f32; n];
3197 let mut next_rank = vec![0.0_f32; n];
3198
3199 for _ in 0..MAX_ITERATIONS {
3200 // Collect dangling mass (nodes with no outgoing edges)
3201 let dangling: f32 = rank
3202 .iter()
3203 .enumerate()
3204 .filter(|&(i, _)| out_edges[i].is_empty())
3205 .map(|(_, &r)| r)
3206 .sum();
3207
3208 // Distribute rank
3209 for (i, nr) in next_rank.iter_mut().enumerate() {
3210 *nr = (1.0 - DAMPING).mul_add(bias[i], DAMPING * dangling * bias[i]);
3211 }
3212
3213 for (src, edges_list) in out_edges.iter().enumerate() {
3214 if edges_list.is_empty() {
3215 continue;
3216 }
3217 let src_rank = rank[src];
3218 let total_w = out_weight[src];
3219 for &(dst, w) in edges_list {
3220 next_rank[dst] += DAMPING * src_rank * (w / total_w);
3221 }
3222 }
3223
3224 // Check convergence
3225 let diff: f32 = rank
3226 .iter()
3227 .zip(next_rank.iter())
3228 .map(|(a, b)| (a - b).abs())
3229 .sum();
3230
3231 std::mem::swap(&mut rank, &mut next_rank);
3232
3233 if diff < EPSILON {
3234 break;
3235 }
3236 }
3237
3238 rank
3239}
3240
3241// ── Graph Building ───────────────────────────────────────────────────
3242
3243/// Intermediate result from definition-level graph computation.
3244struct DefGraphData {
3245 def_edges: Vec<(DefId, DefId, u32)>,
3246 def_ranks: Vec<f32>,
3247 def_callers: Vec<Vec<DefId>>,
3248 def_callees: Vec<Vec<DefId>>,
3249 offsets: Vec<usize>,
3250 base_ranks: Vec<f32>,
3251 file_edges: Vec<(u32, u32, u32)>,
3252}
3253
3254/// Build bidirectional trait↔impl method edges for PageRank propagation (G3).
3255///
3256/// For every impl method that overrides a trait method, adds:
3257/// - `(impl_def_id, trait_def_id, 1)` — impl → trait
3258/// - `(trait_def_id, impl_def_id, 1)` — trait → impl
3259///
3260/// Detection heuristic: an impl method "overrides" a trait method when:
3261/// - The impl method's kind is `"function_item"` and its `scope` starts with
3262/// `"impl_item"`.
3263/// - The trait method's kind is `"function_signature_item"` and its `scope`
3264/// starts with `"trait_item"`.
3265/// - Both have the same `name`.
3266/// - The impl's file imports the trait's file (or they share a file).
3267///
3268/// This is heuristic, not sound: it may produce false positives when two
3269/// unrelated traits define methods with the same name. The practical false-
3270/// positive rate on real Rust codebases is low because method names are
3271/// usually unique within a crate.
3272#[must_use]
3273pub fn build_trait_impl_edges_pub(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
3274 build_trait_impl_edges(files)
3275}
3276
3277fn build_trait_impl_edges(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
3278 // Build index: method_name → list of (DefId, is_trait_method).
3279 // trait method: kind == "function_signature_item" (abstract) OR scope contains "trait_item".
3280 // impl method: kind == "function_item" AND scope contains "impl_item".
3281 let mut trait_methods: HashMap<String, Vec<DefId>> = HashMap::new();
3282 let mut impl_methods: HashMap<String, Vec<DefId>> = HashMap::new();
3283
3284 for (fi, file) in files.iter().enumerate() {
3285 for (di, def) in file.defs.iter().enumerate() {
3286 #[expect(clippy::cast_possible_truncation)]
3287 let did: DefId = (fi as u32, di as u16);
3288 if def.kind == "function_signature_item"
3289 || (def.scope.starts_with("trait_item") && def.kind == "function_item")
3290 {
3291 trait_methods.entry(def.name.clone()).or_default().push(did);
3292 } else if def.kind == "function_item" && def.scope.starts_with("impl_item") {
3293 impl_methods.entry(def.name.clone()).or_default().push(did);
3294 }
3295 }
3296 }
3297
3298 // Pre-build imported-files sets to restrict matching.
3299 let imported_sets: Vec<std::collections::HashSet<u32>> = files
3300 .iter()
3301 .map(|f| {
3302 f.imports
3303 .iter()
3304 .filter_map(|imp| imp.resolved_idx)
3305 .collect()
3306 })
3307 .collect();
3308
3309 let mut edges: Vec<(DefId, DefId, u32)> = Vec::new();
3310
3311 for (name, trait_defs) in &trait_methods {
3312 let Some(impl_defs) = impl_methods.get(name) else {
3313 continue;
3314 };
3315 for &(tf, td) in trait_defs {
3316 for &(imf, imd) in impl_defs {
3317 // The impl file must import the trait file (or be the same file).
3318 let connected = tf == imf
3319 || imported_sets
3320 .get(imf as usize)
3321 .is_some_and(|s| s.contains(&tf));
3322 if connected {
3323 let trait_id: DefId = (tf, td);
3324 let impl_id: DefId = (imf, imd);
3325 edges.push((trait_id, impl_id, 1));
3326 edges.push((impl_id, trait_id, 1));
3327 }
3328 }
3329 }
3330 }
3331
3332 edges
3333}
3334
3335/// Build definition-level edges, compute `PageRank`, and derive file-level data.
3336fn compute_def_graph(files: &[FileNode]) -> DefGraphData {
3337 // Build definition-level edge list from resolved calls
3338 let mut def_edge_map: HashMap<(DefId, DefId), u32> = HashMap::new();
3339 for (file_idx, file) in files.iter().enumerate() {
3340 for (def_idx, def) in file.defs.iter().enumerate() {
3341 #[expect(clippy::cast_possible_truncation)]
3342 let caller_id: DefId = (file_idx as u32, def_idx as u16);
3343 for call in &def.calls {
3344 if let Some(callee_id) = call.resolved {
3345 *def_edge_map.entry((caller_id, callee_id)).or_insert(0) += 1;
3346 }
3347 }
3348 }
3349 }
3350
3351 // Add trait↔impl bidirectional edges (G3).
3352 let trait_impl_edges = build_trait_impl_edges(files);
3353 for (src, dst, w) in trait_impl_edges {
3354 *def_edge_map.entry((src, dst)).or_insert(0) += w;
3355 }
3356
3357 let def_edges: Vec<(DefId, DefId, u32)> = def_edge_map
3358 .into_iter()
3359 .map(|((src, dst), w)| (src, dst, w))
3360 .collect();
3361
3362 // Compute def-level PageRank
3363 let offsets = def_offsets(files);
3364 let n_defs = *offsets.last().unwrap_or(&0);
3365
3366 let flat_def_edges: Vec<(u32, u32, u32)> = def_edges
3367 .iter()
3368 .map(|(src, dst, w)| {
3369 #[expect(clippy::cast_possible_truncation)]
3370 (
3371 flatten_def_id(&offsets, *src) as u32,
3372 flatten_def_id(&offsets, *dst) as u32,
3373 *w,
3374 )
3375 })
3376 .collect();
3377
3378 let def_ranks = pagerank(n_defs, &flat_def_edges, None);
3379
3380 // Derive file-level edges from def-level call edges. A cross-file def→def
3381 // edge contributes one file→file edge in the same direction; intra-file
3382 // edges are skipped (they cannot move rank between files).
3383 let mut file_edge_map: HashMap<(u32, u32), u32> = HashMap::new();
3384 for &(src, dst, w) in &def_edges {
3385 let src_file = src.0;
3386 let dst_file = dst.0;
3387 if src_file != dst_file {
3388 *file_edge_map.entry((src_file, dst_file)).or_insert(0) += w;
3389 }
3390 }
3391 let file_edges: Vec<(u32, u32, u32)> = file_edge_map
3392 .into_iter()
3393 .map(|((src, dst), w)| (src, dst, w))
3394 .collect();
3395
3396 // File-level rank: run PageRank directly on the file-level edge graph
3397 // (B-0025 fix, Cycle 11). The earlier formulation aggregated file rank
3398 // by summing per-def ranks of every def in the file, which amplified
3399 // the teleportation floor for files with high def cardinality but zero
3400 // in-degree (test files in particular). Running PageRank on
3401 // `file_edges` makes file rank obey the user-visible invariant: rank
3402 // flows from caller-file to callee-file, never from callee to caller.
3403 //
3404 // A file with no incoming file→file edges receives only the
3405 // teleportation floor `(1 - DAMPING) / n_files`, regardless of how
3406 // many defs it contains. Test files (which call hubs but are called
3407 // by no one) now correctly drop to the floor.
3408 let n_files = files.len();
3409 let base_ranks: Vec<f32> = if n_files == 0 {
3410 Vec::new()
3411 } else {
3412 pagerank(n_files, &file_edges, None)
3413 };
3414
3415 // Build def-level caller/callee lists
3416 let (def_callers, def_callees) = build_def_neighbor_lists(n_defs, &flat_def_edges, &offsets);
3417
3418 DefGraphData {
3419 def_edges,
3420 def_ranks,
3421 def_callers,
3422 def_callees,
3423 offsets,
3424 base_ranks,
3425 file_edges,
3426 }
3427}
3428
3429/// Build a dependency graph from a repository root.
3430///
3431/// Walks the directory tree, parses each supported file with tree-sitter,
3432/// extracts definitions and imports, resolves import paths to files, runs
3433/// `PageRank`, and builds caller/callee lists.
3434///
3435/// # Errors
3436///
3437/// Returns an error if file walking or reading fails.
3438#[expect(
3439 clippy::too_many_lines,
3440 reason = "three-phase parallel pipeline (walk+filter, def+import extraction, call extraction) \
3441 plus resolve + graph build; phases share state (file_index, raw_sources) and \
3442 cannot be meaningfully split without passing large mutable structures across \
3443 boundaries with no clarity gain"
3444)]
3445pub fn build_graph(root: &Path) -> crate::Result<RepoGraph> {
3446 let root = root.canonicalize().map_err(|e| crate::Error::Io {
3447 path: root.display().to_string(),
3448 source: e,
3449 })?;
3450
3451 let mut walk_options = walk::WalkOptions::default();
3452 if let Some((_, config)) = crate::cache::config::find_config(&root) {
3453 walk_options.ignore_patterns = config.ignore.patterns;
3454 }
3455 let all_files = walk::collect_files_with_options(&root, &walk_options);
3456
3457 // Phase 1: parallel filter + read. For each candidate path with a
3458 // supported extension, read its source from disk and emit a tuple
3459 // alongside its relative path. rayon spreads the I/O cost across
3460 // worker threads; on a 1M-file corpus this was ~20s sequential and
3461 // now sits in the 2-3s range bounded by disk + filter throughput.
3462 let raw_inputs: Vec<(PathBuf, String, String, String)> = all_files
3463 .par_iter()
3464 .filter_map(|path| {
3465 let ext = path
3466 .extension()
3467 .and_then(|e| e.to_str())
3468 .unwrap_or_default()
3469 .to_string();
3470 if languages::config_for_extension(&ext).is_none()
3471 && import_query_for_extension(&ext).is_none()
3472 {
3473 return None;
3474 }
3475 let source = std::fs::read_to_string(path).ok()?;
3476 let rel_path = path
3477 .strip_prefix(&root)
3478 .unwrap_or(path)
3479 .display()
3480 .to_string();
3481 Some((path.clone(), rel_path, ext, source))
3482 })
3483 .collect();
3484
3485 // Build the contiguous `files` Vec and the absolute-path -> idx
3486 // lookup. Sequential because both want stable indices that match
3487 // `raw_sources`'s order; the per-file work this gates is trivial.
3488 let mut file_index: HashMap<PathBuf, usize> = HashMap::with_capacity(raw_inputs.len());
3489 let mut files: Vec<FileNode> = Vec::with_capacity(raw_inputs.len());
3490 let mut raw_sources: Vec<(usize, String, String)> = Vec::with_capacity(raw_inputs.len());
3491 for (idx, (abs_path, rel_path, ext, source)) in raw_inputs.into_iter().enumerate() {
3492 file_index.insert(abs_path, idx);
3493 files.push(FileNode {
3494 path: rel_path,
3495 defs: vec![],
3496 imports: vec![],
3497 });
3498 raw_sources.push((idx, ext, source));
3499 }
3500
3501 // Phase 2: parallel per-file definition + import extraction. Each
3502 // file's tree-sitter parse + def/import queries are independent;
3503 // par_iter_mut over files.iter_mut().zip(raw_sources.par_iter())
3504 // lets every rayon worker grind its own slice. The closures here
3505 // borrow `&root` and `&file_index` immutably (both Sync) and write
3506 // disjoint `FileNode` slots via the &mut iterator.
3507 files
3508 .par_iter_mut()
3509 .zip(raw_sources.par_iter())
3510 .for_each(|(file, (_, ext, source))| {
3511 if let Some(config) = languages::config_for_extension(ext) {
3512 file.defs = extract_definitions(source, &config);
3513 // Go method_declaration scopes are empty after the generic
3514 // extract_definitions pass (the method is a top-level node
3515 // with no structural parent in CONTAINER_KINDS). Enrich them
3516 // with the receiver type so that resolve_calls Priority 2
3517 // (scope.contains(recv_type)) fires correctly for cross-file
3518 // Go receiver-method calls. This populates def_callers[] for
3519 // Go in compute_def_graph (P1 fix).
3520 if languages::is_go_language(&config.language) {
3521 enrich_go_method_def_scopes(source, &mut file.defs);
3522 }
3523 // SQL: prepend a synthetic file-level def named after the
3524 // filename stem (dbt/sqlmesh convention). The whole-file
3525 // byte range becomes the smallest-enclosing fallback for
3526 // FROM/JOIN call-edges that are not inside any CTE, which
3527 // is the resolution target for cross-model references
3528 // (S1, Wave 4). file.path is relative to the repo root and
3529 // is what file_stem() needs to derive the model name.
3530 if languages::is_sql_language(&config.language) {
3531 enrich_sql_file_def(&file.path, source, &mut file.defs);
3532 }
3533 }
3534 if let Some((lang, import_query)) = import_query_for_extension(ext) {
3535 let raw_imports = extract_imports(source, &lang, &import_query);
3536 let file_path = root.join(&file.path);
3537 file.imports = raw_imports
3538 .into_iter()
3539 .map(|raw| {
3540 let resolved_idx =
3541 resolve_import(&raw, ext, &file_path, &root, &file_index)
3542 .and_then(|i| u32::try_from(i).ok());
3543 ImportRef {
3544 raw_path: raw,
3545 resolved_idx,
3546 }
3547 })
3548 .collect();
3549 }
3550 });
3551
3552 // Phase 3: parallel per-file call extraction. Mutates each
3553 // FileNode's `defs[*].calls` independently. Aligned with
3554 // raw_sources by index via the zip.
3555 files
3556 .par_iter_mut()
3557 .zip(raw_sources.par_iter())
3558 .for_each(|(file, (_, ext, source))| {
3559 if let Some(call_config) = languages::call_query_for_extension(ext) {
3560 extract_calls(source, &call_config, &mut file.defs);
3561 }
3562 });
3563
3564 // Build the Python class hierarchy (class_name → parent class names) by
3565 // walking every Python source file. The map is used by `resolve_calls`
3566 // Priority 2.5 to dispatch `self.method()` calls through the MRO when
3567 // the method lives on a parent / mixin class (Q1, Wave 2).
3568 //
3569 // Parallel: extract_python_class_hierarchy is pure per-file, then we
3570 // fold the per-file maps into one global map sequentially because
3571 // HashMap is not lock-free. On a 1k-Python-file corpus this fold takes
3572 // <10ms — much smaller than the parallel parse work that feeds it.
3573 let python_hierarchies: Vec<HashMap<String, Vec<String>>> = raw_sources
3574 .par_iter()
3575 .map(|(_, ext, source)| {
3576 if ext == "py" || ext == "pyi" {
3577 extract_python_class_hierarchy(source)
3578 } else {
3579 HashMap::new()
3580 }
3581 })
3582 .collect();
3583 let mut python_class_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
3584 for local in python_hierarchies {
3585 for (k, v) in local {
3586 // First declaration wins on name collisions across files. The
3587 // MRO walk only needs a plausible parent chain to find an
3588 // ancestor's methods; this is conservative but acceptable.
3589 python_class_hierarchy.entry(k).or_insert(v);
3590 }
3591 }
3592
3593 // Resolve call references to target definitions
3594 let def_index = build_def_index(&files);
3595 resolve_calls(&mut files, &def_index, &python_class_hierarchy);
3596
3597 // Build def-level graph, compute PageRank, and derive file-level data
3598 let graph_data = compute_def_graph(&files);
3599
3600 // Build file-level caller/callee lists
3601 let n = files.len();
3602 let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
3603
3604 // Auto-tune alpha based on graph density
3605 #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
3606 let density = if n > 1 {
3607 graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
3608 } else {
3609 0.0
3610 };
3611 let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
3612
3613 Ok(RepoGraph {
3614 files,
3615 edges: graph_data.file_edges,
3616 base_ranks: graph_data.base_ranks,
3617 callers,
3618 callees,
3619 def_edges: graph_data.def_edges,
3620 def_ranks: graph_data.def_ranks,
3621 def_callers: graph_data.def_callers,
3622 def_callees: graph_data.def_callees,
3623 def_offsets: graph_data.offsets,
3624 alpha,
3625 })
3626}
3627
3628/// Build a `RepoGraph` directly from a pre-constructed `Vec<FileNode>`.
3629///
3630/// Skips the filesystem walk phase of [`build_graph`]; useful for integration
3631/// tests that want to build synthetic graphs without touching disk.
3632///
3633/// Resolves calls, builds the def-level graph (including G3 trait↔impl edges),
3634/// computes `PageRank`, and builds caller/callee lists.
3635#[must_use]
3636pub fn build_graph_from_files_pub(files: Vec<FileNode>) -> RepoGraph {
3637 let empty_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
3638 build_graph_from_files_with_mro_pub(files, &empty_hierarchy)
3639}
3640
3641/// Build a `RepoGraph` from synthetic files with Python MRO resolution enabled.
3642///
3643/// Identical to [`build_graph_from_files_pub`] except that the resolver's
3644/// Priority 2.5 (MRO walk) fires against the caller-supplied
3645/// `python_class_hierarchy` map. Use this when an integration test needs to
3646/// exercise `self.method()` dispatch through inherited classes — including
3647/// the inverse-edge propagation into [`RepoGraph::def_callers`] that
3648/// `lsp_incoming_calls` consumes.
3649///
3650/// I#58 / 4.1.3: when a `SubScreen.method_caller` calls `self.method()` and
3651/// the MRO walk binds the call to `Mixin.method`, the forward edge
3652/// `(SubScreen.method_caller, Mixin.method)` is recorded in
3653/// [`DefGraphData::def_edges`]. [`build_def_neighbor_lists`] then derives
3654/// symmetric inverse-edge entries: `def_callers[Mixin.method]` includes
3655/// `SubScreen.method_caller`. This is the property `lsp_incoming_calls` reads.
3656#[must_use]
3657pub fn build_graph_from_files_with_mro_pub<H>(
3658 mut files: Vec<FileNode>,
3659 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
3660) -> RepoGraph
3661where
3662 H: std::hash::BuildHasher,
3663{
3664 let def_index = build_def_index(&files);
3665 resolve_calls(&mut files, &def_index, python_class_hierarchy);
3666 let graph_data = compute_def_graph(&files);
3667 let n = files.len();
3668 let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
3669
3670 #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
3671 let density = if n > 1 {
3672 graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
3673 } else {
3674 0.0
3675 };
3676 let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
3677
3678 RepoGraph {
3679 files,
3680 edges: graph_data.file_edges,
3681 base_ranks: graph_data.base_ranks,
3682 callers,
3683 callees,
3684 def_edges: graph_data.def_edges,
3685 def_ranks: graph_data.def_ranks,
3686 def_callers: graph_data.def_callers,
3687 def_callees: graph_data.def_callees,
3688 def_offsets: graph_data.offsets,
3689 alpha,
3690 }
3691}
3692
3693// ── Dead-code analysis ───────────────────────────────────────────────────────
3694
3695/// Global flat definition index: `def_offsets[file_idx] + def_idx_within_file`.
3696///
3697/// This is the natural index into [`RepoGraph::def_ranks`],
3698/// [`RepoGraph::def_callers`], and [`RepoGraph::def_callees`].
3699pub type DefIndex = usize;
3700
3701/// A connected component in the dead-code subgraph.
3702///
3703/// All members are definitions that are unreachable from any entry point.
3704/// The component is formed by treating `def_callees + def_callers` edges as
3705/// undirected — so a transitively-dead group (including mutual-recursion
3706/// cycles that are collectively unreachable) surfaces as one cluster.
3707#[derive(Debug, Clone)]
3708pub struct DeadCluster {
3709 /// Global flat def index of the cluster root (the member with the
3710 /// highest [`RepoGraph::def_ranks`] score).
3711 pub root_def_idx: usize,
3712 /// Number of definitions in this cluster.
3713 pub size: usize,
3714 /// Sum of `end_line - start_line` for every member definition.
3715 pub total_lines: usize,
3716 /// All member global def indices, root first.
3717 pub member_def_indices: Vec<usize>,
3718}
3719
3720/// Confidence level for [`DeadCodeReport::dead_fraction`].
3721///
3722/// Callers should interpret `dead_fraction` according to this indicator.
3723/// When entry-point coverage is sparse (common in test-heavy corpora or
3724/// macro-dispatched frameworks), the raw fraction can be misleading — the
3725/// cluster list is always more trustworthy than the absolute number.
3726///
3727/// ## Rubric
3728///
3729/// | Level | Condition |
3730/// |--------|------------------------------------------------------------------------|
3731/// | High | entry coverage ≥ 10 % AND ≥ 1 LibraryExport AND ≥ 1 Main |
3732/// | Medium | entry coverage ≥ 2 % AND ≥ 1 production entry AND tests not dominant |
3733/// | Low | entry coverage < 2 % OR tests > 80 % of all entries |
3734///
3735/// "Production entries" = LibraryExport + FrameworkDispatched + Main + Ffi.
3736#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
3737#[serde(rename_all = "snake_case")]
3738pub enum DeadCodeConfidence {
3739 /// Entry-point coverage is dense; `dead_fraction` is a quantitative signal.
3740 High,
3741 /// `dead_fraction` is directional; trust the cluster list more than the number.
3742 Medium,
3743 /// `dead_fraction` may mislead (e.g., tests-dominated corpus); use cluster
3744 /// `member_defs` + `lsp_references` to verify each candidate before deletion.
3745 Low,
3746}
3747
3748/// Compute a [`DeadCodeConfidence`] level from entry-point kind counts.
3749///
3750/// Parameters mirror the counts emitted by
3751/// [`crate::entry_points::summarize_entry_point_kinds`]:
3752///
3753/// - `total_defs` — total number of definitions in the graph.
3754/// - `library_exports` — count of [`crate::entry_points::EntryPointKind::LibraryExport`] entries.
3755/// - `framework_dispatched` — count of [`crate::entry_points::EntryPointKind::FrameworkDispatched`] entries.
3756/// - `main_entries` — count of [`crate::entry_points::EntryPointKind::Main`] entries.
3757/// - `test_entries` — count of [`crate::entry_points::EntryPointKind::Test`] entries.
3758/// - `ffi_entries` — count of [`crate::entry_points::EntryPointKind::Ffi`] entries.
3759///
3760/// ProcMacro / Init / BuildScript entries are not passed separately; they are
3761/// rare enough that their absence does not materially affect the heuristic.
3762#[must_use]
3763#[expect(
3764 clippy::cast_precision_loss,
3765 reason = "entry and def counts fit comfortably in f64 for ratio computation"
3766)]
3767pub fn compute_confidence(
3768 total_defs: usize,
3769 library_exports: usize,
3770 framework_dispatched: usize,
3771 main_entries: usize,
3772 test_entries: usize,
3773 ffi_entries: usize,
3774) -> DeadCodeConfidence {
3775 let total_entries =
3776 library_exports + framework_dispatched + main_entries + test_entries + ffi_entries;
3777 let entry_coverage = if total_defs > 0 {
3778 total_entries as f64 / total_defs as f64
3779 } else {
3780 0.0
3781 };
3782 let test_dominant = total_entries > 0 && (test_entries as f64 / total_entries as f64) > 0.8;
3783 let production_entries = library_exports + framework_dispatched + main_entries + ffi_entries;
3784
3785 if entry_coverage >= 0.10 && library_exports >= 1 && main_entries >= 1 {
3786 DeadCodeConfidence::High
3787 } else if entry_coverage >= 0.02 && production_entries >= 1 && !test_dominant {
3788 DeadCodeConfidence::Medium
3789 } else {
3790 DeadCodeConfidence::Low
3791 }
3792}
3793
3794/// Summary report from [`compute_dead_code`].
3795///
3796/// The primary consumer is X3's `mcp__ripvec__find_dead_code` MCP tool.
3797#[derive(Debug, Clone)]
3798pub struct DeadCodeReport {
3799 /// Dead clusters sorted by size descending (largest first).
3800 pub dead_clusters: Vec<DeadCluster>,
3801 /// Total number of definitions unreachable from any entry point.
3802 pub total_dead_defs: usize,
3803 /// Total number of definitions reachable from at least one entry point.
3804 pub total_live_defs: usize,
3805 /// Fraction of all definitions that are dead: `dead / (dead + live)`.
3806 pub dead_fraction: f32,
3807 /// Confidence that `dead_fraction` is a reliable signal.
3808 ///
3809 /// Defaults to [`DeadCodeConfidence::Low`] until the caller supplies
3810 /// entry-point kind counts via [`compute_confidence`].
3811 pub confidence: DeadCodeConfidence,
3812}
3813
3814/// Returns true if the given path is a test-related path.
3815///
3816/// Heuristic: the path contains `tests/`, `/spec/`, `/specs/`, or the
3817/// file stem starts/ends with `test_`/`_test` or contains `bench`.
3818fn is_test_path(path: &str) -> bool {
3819 let path_lc = path.to_lowercase();
3820 if path_lc.contains("tests/") || path_lc.contains("/spec/") || path_lc.contains("/specs/") {
3821 return true;
3822 }
3823 let file_name = path.rsplit('/').next().unwrap_or(path);
3824 let stem = file_name.split('.').next().unwrap_or(file_name);
3825 stem.starts_with("test_") || stem.ends_with("_test") || stem.contains("bench")
3826}
3827
3828/// Resolve a flat [`DefIndex`] to its file index using the `def_offsets`
3829/// prefix-sum table.
3830fn flat_to_file_idx(offsets: &[usize], flat: DefIndex) -> usize {
3831 offsets.partition_point(|&o| o <= flat).saturating_sub(1)
3832}
3833
3834/// Union-find: path-compressing find. Returns the root representative of `x`.
3835fn uf_find(parent: &mut Vec<usize>, x: usize) -> usize {
3836 if parent[x] != x {
3837 parent[x] = uf_find(parent, parent[x]);
3838 }
3839 parent[x]
3840}
3841
3842/// Union-find: merge the components containing `x` and `y`.
3843fn uf_union(parent: &mut Vec<usize>, x: usize, y: usize) {
3844 let rx = uf_find(parent, x);
3845 let ry = uf_find(parent, y);
3846 if rx != ry {
3847 parent[rx] = ry;
3848 }
3849}
3850
3851/// Compute the set of definitions unreachable from any entry point.
3852///
3853/// Returns dead clusters (connected components in the unreachable subgraph),
3854/// sorted by size descending.
3855///
3856/// # Algorithm
3857///
3858/// 1. Optionally filter test paths from `entry_def_indices` (when
3859/// `include_test_paths` is `false`). Test-path heuristic: the file path
3860/// contains `test_`, `_test`, `tests/`, `spec/`, `specs/`, or `bench`.
3861/// 2. Build full forward + reverse adjacency from [`RepoGraph::def_edges`]
3862/// (the untruncated edge list) as a Compressed Sparse Row (CSR) pair
3863/// of (`starts`, `dst`) `Vec<u32>` arrays — bounded O(E) memory rather
3864/// than the O(n_defs * avg_fanout) of duplicated `Vec<Vec<DefIndex>>`
3865/// that crashed at kernel scale (I#61). The BFS does NOT use
3866/// [`RepoGraph::def_callees`] / [`RepoGraph::def_callers`]: those are
3867/// display-oriented neighbor lists capped at [`MAX_NEIGHBORS`] per node,
3868/// which dropped hub callees from the live set and produced
3869/// false-positive dead reports (I#57).
3870/// 3. BFS forward over the full forward adjacency from the entry seeds ->
3871/// reachable set.
3872/// 4. Complement `(all_defs - reachable)` = dead set.
3873/// 5. Connected-components on the dead subgraph via the full forward +
3874/// reverse adjacency treated as undirected (union-find).
3875/// 5. For each cluster: pick the highest-rank def as the cluster root;
3876/// `size` = member count; `total_lines` = sum of `(end_line -
3877/// start_line)` per member.
3878/// 6. Sort clusters by size descending.
3879#[must_use]
3880#[expect(
3881 clippy::too_many_lines,
3882 reason = "six-step BFS+clustering pipeline; splitting into sub-functions \
3883 would require passing many interdependent Vec borrows with no \
3884 clarity gain"
3885)]
3886pub fn compute_dead_code<S: std::hash::BuildHasher>(
3887 graph: &RepoGraph,
3888 entry_def_indices: &HashSet<DefIndex, S>,
3889 include_test_paths: bool,
3890) -> DeadCodeReport {
3891 let n_defs = graph.def_ranks.len();
3892 if n_defs == 0 {
3893 return DeadCodeReport {
3894 dead_clusters: vec![],
3895 total_dead_defs: 0,
3896 total_live_defs: 0,
3897 dead_fraction: 0.0,
3898 confidence: DeadCodeConfidence::Low,
3899 };
3900 }
3901
3902 // Step 1: build the effective seed set.
3903 let seeds: Vec<DefIndex> = if include_test_paths {
3904 entry_def_indices.iter().copied().collect()
3905 } else {
3906 entry_def_indices
3907 .iter()
3908 .copied()
3909 .filter(|&flat| {
3910 let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
3911 let path = graph
3912 .files
3913 .get(file_idx)
3914 .map(|f| f.path.as_str())
3915 .unwrap_or("");
3916 !is_test_path(path)
3917 })
3918 .collect()
3919 };
3920
3921 // Build CSR-style forward + reverse adjacency from `graph.def_edges`.
3922 //
3923 // We do NOT use `graph.def_callees` / `graph.def_callers` here because
3924 // those are display-oriented neighbor lists truncated to MAX_NEIGHBORS
3925 // in `build_def_neighbor_lists`. Reading a truncated forward list during
3926 // BFS produced false-positive dead reports (I#57): a hub function with
3927 // more than 5 outgoing call edges silently dropped its low-weight
3928 // callees from the live set, and every helper transitively reachable
3929 // only through those dropped edges was marked dead.
3930 //
3931 // The original I#57 fix materialized `full_callees` / `full_callers` as
3932 // two `Vec<Vec<DefIndex>>` tables, which at kernel scale (n_defs ≈
3933 // 600 K with high-fan-out hubs) cost ~1.9 GB and OOM-crashed the MCP
3934 // server on every parameter variant (I#61, regression).
3935 //
3936 // This implementation uses Compressed Sparse Row (CSR) bucketing:
3937 // - `fwd_starts[i..=i+1]` slices `fwd_dst` to the destinations of node i.
3938 // - `rev_starts[i..=i+1]` slices `rev_src` to the sources that call i.
3939 // Storage is bounded O(E) (one u32 per directed edge × 2 directions)
3940 // plus O(n_defs) for the two start arrays. For a 600 K-def / 5 M-edge
3941 // kernel graph this is ~80 MB rather than ~1.9 GB.
3942 //
3943 // u32 is sufficient: every flat DefIndex value is < n_defs and ripvec
3944 // already constrains `n_defs <= u32::MAX` upstream (DefId.0 is u32).
3945 #[expect(
3946 clippy::cast_possible_truncation,
3947 reason = "n_defs <= u32::MAX is a ripvec-wide invariant; the per-edge \
3948 cast cannot overflow"
3949 )]
3950 let (fwd_starts, fwd_dst, rev_starts, rev_src) = {
3951 // First pass: per-node out-degree (forward) and in-degree (reverse).
3952 let mut fwd_deg: Vec<u32> = vec![0; n_defs];
3953 let mut rev_deg: Vec<u32> = vec![0; n_defs];
3954 for &(src, dst, _w) in &graph.def_edges {
3955 let src_flat = graph.def_offsets[src.0 as usize] + src.1 as usize;
3956 let dst_flat = graph.def_offsets[dst.0 as usize] + dst.1 as usize;
3957 if src_flat < n_defs && dst_flat < n_defs {
3958 fwd_deg[src_flat] += 1;
3959 rev_deg[dst_flat] += 1;
3960 }
3961 }
3962 // Exclusive prefix sums -> bucket starts of length n_defs + 1.
3963 let mut fwd_starts: Vec<u32> = Vec::with_capacity(n_defs + 1);
3964 let mut rev_starts: Vec<u32> = Vec::with_capacity(n_defs + 1);
3965 let mut acc_f: u32 = 0;
3966 let mut acc_r: u32 = 0;
3967 fwd_starts.push(0);
3968 rev_starts.push(0);
3969 for i in 0..n_defs {
3970 acc_f = acc_f.saturating_add(fwd_deg[i]);
3971 acc_r = acc_r.saturating_add(rev_deg[i]);
3972 fwd_starts.push(acc_f);
3973 rev_starts.push(acc_r);
3974 }
3975 // Second pass: place each edge using a per-node cursor.
3976 let total_edges = acc_f as usize;
3977 let mut fwd_dst: Vec<u32> = vec![0u32; total_edges];
3978 let mut rev_src: Vec<u32> = vec![0u32; total_edges];
3979 // Reuse fwd_deg / rev_deg as cursors counting down from per-bucket capacity.
3980 // Easier: cursors start at the bucket start.
3981 let mut fwd_cursor: Vec<u32> = fwd_starts[..n_defs].to_vec();
3982 let mut rev_cursor: Vec<u32> = rev_starts[..n_defs].to_vec();
3983 for &(src, dst, _w) in &graph.def_edges {
3984 let src_flat = graph.def_offsets[src.0 as usize] + src.1 as usize;
3985 let dst_flat = graph.def_offsets[dst.0 as usize] + dst.1 as usize;
3986 if src_flat < n_defs && dst_flat < n_defs {
3987 let f = fwd_cursor[src_flat] as usize;
3988 fwd_dst[f] = dst_flat as u32;
3989 fwd_cursor[src_flat] += 1;
3990 let r = rev_cursor[dst_flat] as usize;
3991 rev_src[r] = src_flat as u32;
3992 rev_cursor[dst_flat] += 1;
3993 }
3994 }
3995 (fwd_starts, fwd_dst, rev_starts, rev_src)
3996 };
3997
3998 // Helper closures: O(1) bucket lookup into the CSR forward / reverse views.
3999 let fwd_callees = |flat: DefIndex| -> &[u32] {
4000 let lo = fwd_starts[flat] as usize;
4001 let hi = fwd_starts[flat + 1] as usize;
4002 &fwd_dst[lo..hi]
4003 };
4004 let rev_callers = |flat: DefIndex| -> &[u32] {
4005 let lo = rev_starts[flat] as usize;
4006 let hi = rev_starts[flat + 1] as usize;
4007 &rev_src[lo..hi]
4008 };
4009
4010 // Step 2: BFS forward over the CSR forward view from seeds -> reachable set.
4011 let mut reachable: Vec<bool> = vec![false; n_defs];
4012 let mut queue: std::collections::VecDeque<DefIndex> = std::collections::VecDeque::new();
4013
4014 for seed in &seeds {
4015 if *seed < n_defs && !reachable[*seed] {
4016 reachable[*seed] = true;
4017 queue.push_back(*seed);
4018 }
4019 }
4020
4021 while let Some(flat) = queue.pop_front() {
4022 for &callee_flat_u32 in fwd_callees(flat) {
4023 let callee_flat = callee_flat_u32 as usize;
4024 if callee_flat < n_defs && !reachable[callee_flat] {
4025 reachable[callee_flat] = true;
4026 queue.push_back(callee_flat);
4027 }
4028 }
4029 }
4030
4031 // Step 3: dead set = all_defs - reachable.
4032 let dead: Vec<DefIndex> = (0..n_defs).filter(|&i| !reachable[i]).collect();
4033 let total_dead_defs = dead.len();
4034 let total_live_defs = n_defs - total_dead_defs;
4035
4036 if dead.is_empty() {
4037 return DeadCodeReport {
4038 dead_clusters: vec![],
4039 total_dead_defs: 0,
4040 total_live_defs,
4041 dead_fraction: 0.0,
4042 confidence: DeadCodeConfidence::Low,
4043 };
4044 }
4045
4046 // Step 4: connected components on the dead subgraph via union-find.
4047 let dead_set: HashSet<DefIndex> = dead.iter().copied().collect();
4048 let dead_pos: HashMap<DefIndex, usize> = dead
4049 .iter()
4050 .copied()
4051 .enumerate()
4052 .map(|(pos, idx)| (idx, pos))
4053 .collect();
4054 let m = dead.len();
4055 let mut parent: Vec<usize> = (0..m).collect();
4056
4057 // Use the same CSR forward / reverse views built above for the BFS so
4058 // the dead subgraph's connected components reflect every actual call
4059 // edge — not the [`MAX_NEIGHBORS`]-truncated display view (I#57). The CSR
4060 // layout keeps storage O(E) at kernel scale rather than the duplicated
4061 // O(n_defs * avg_fanout) of the pre-I#61 `Vec<Vec<DefIndex>>` (I#61).
4062 for &flat in &dead {
4063 let pos_flat = dead_pos[&flat];
4064 for &callee_flat_u32 in fwd_callees(flat) {
4065 let callee_flat = callee_flat_u32 as usize;
4066 if dead_set.contains(&callee_flat) {
4067 let pos_callee = dead_pos[&callee_flat];
4068 uf_union(&mut parent, pos_flat, pos_callee);
4069 }
4070 }
4071 for &caller_flat_u32 in rev_callers(flat) {
4072 let caller_flat = caller_flat_u32 as usize;
4073 if dead_set.contains(&caller_flat) {
4074 let pos_caller = dead_pos[&caller_flat];
4075 uf_union(&mut parent, pos_flat, pos_caller);
4076 }
4077 }
4078 }
4079
4080 // Flatten roots (path compression for all).
4081 for i in 0..m {
4082 uf_find(&mut parent, i);
4083 }
4084
4085 // Group members by their component root.
4086 let mut components: HashMap<usize, Vec<DefIndex>> = HashMap::new();
4087 for (pos, &flat) in dead.iter().enumerate() {
4088 let root_pos = parent[pos];
4089 components.entry(root_pos).or_default().push(flat);
4090 }
4091
4092 // Step 5: build clusters - root is the highest-rank member.
4093 let mut clusters: Vec<DeadCluster> = components
4094 .into_values()
4095 .map(|members| {
4096 let root_flat = members
4097 .iter()
4098 .copied()
4099 .max_by(|&a, &b| {
4100 let ra = graph.def_ranks.get(a).copied().unwrap_or(0.0);
4101 let rb = graph.def_ranks.get(b).copied().unwrap_or(0.0);
4102 ra.total_cmp(&rb)
4103 })
4104 .unwrap_or(members[0]);
4105
4106 let total_lines: usize = members
4107 .iter()
4108 .copied()
4109 .map(|flat| {
4110 let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
4111 let def_idx = flat - graph.def_offsets[file_idx];
4112 let def = graph.files.get(file_idx).and_then(|f| f.defs.get(def_idx));
4113 def.map(|d| (d.end_line as usize).saturating_sub(d.start_line as usize))
4114 .unwrap_or(0)
4115 })
4116 .sum();
4117
4118 let mut member_def_indices: Vec<usize> = std::iter::once(root_flat)
4119 .chain(members.iter().copied().filter(|&m| m != root_flat))
4120 .collect();
4121 if member_def_indices.len() > 1 {
4122 member_def_indices[1..].sort_unstable();
4123 }
4124
4125 DeadCluster {
4126 root_def_idx: root_flat,
4127 size: member_def_indices.len(),
4128 total_lines,
4129 member_def_indices,
4130 }
4131 })
4132 .collect();
4133
4134 // Step 6: sort by size descending.
4135 clusters.sort_by(|a, b| {
4136 b.size
4137 .cmp(&a.size)
4138 .then(b.root_def_idx.cmp(&a.root_def_idx))
4139 });
4140
4141 #[expect(
4142 clippy::cast_precision_loss,
4143 reason = "def counts fit comfortably in f32"
4144 )]
4145 let dead_fraction = if n_defs > 0 {
4146 total_dead_defs as f32 / n_defs as f32
4147 } else {
4148 0.0
4149 };
4150
4151 DeadCodeReport {
4152 dead_clusters: clusters,
4153 total_dead_defs,
4154 total_live_defs,
4155 dead_fraction,
4156 // Caller must set confidence via compute_confidence() after supplying
4157 // entry-point kind counts. Defaulting to Low is conservative.
4158 confidence: DeadCodeConfidence::Low,
4159 }
4160}
4161
4162impl RepoGraph {
4163 /// Get the `PageRank` score for a specific definition.
4164 #[must_use]
4165 pub fn def_rank(&self, did: DefId) -> f32 {
4166 let flat = self.def_offsets[did.0 as usize] + did.1 as usize;
4167 self.def_ranks.get(flat).copied().unwrap_or(0.0)
4168 }
4169
4170 /// Look up a definition by file path and name. Returns the first match.
4171 #[must_use]
4172 pub fn find_def(&self, file_path: &str, def_name: &str) -> Option<DefId> {
4173 for (file_idx, file) in self.files.iter().enumerate() {
4174 if file.path == file_path {
4175 for (def_idx, def) in file.defs.iter().enumerate() {
4176 if def.name == def_name {
4177 #[expect(clippy::cast_possible_truncation)]
4178 return Some((file_idx as u32, def_idx as u16));
4179 }
4180 }
4181 }
4182 }
4183 None
4184 }
4185
4186 /// Resolve a caller-supplied `focus_file` string to a file index in [`Self::files`].
4187 ///
4188 /// Accepts any of the path forms that ripvec itself emits or accepts:
4189 ///
4190 /// - **Exact stored path** (`device_opt/services/storage.py`) — direct match.
4191 /// - **LSP-shaped path** (`./device_opt/services/storage.py`) — the `./`
4192 /// prefix used by every [`RepoMapLspLocation::file_path`] is stripped
4193 /// before comparison so the documented chaining pattern
4194 /// `get_repo_map(focus_file=hits[0].lsp_location.file_path)` works.
4195 /// - **Strict suffix** (`storage.py`, `services/storage.py`) — match when
4196 /// the previous character in the stored path is `/`. Avoids matching
4197 /// `foo_storage.py` for `storage.py`.
4198 ///
4199 /// Returns [`FocusResolution::Found`] when exactly one file matches,
4200 /// [`FocusResolution::Ambiguous`] when multiple files match (the caller
4201 /// surfaces the candidate list to the user), and [`FocusResolution::NotFound`]
4202 /// when no file matches.
4203 ///
4204 /// # Background
4205 ///
4206 /// Prior to this helper the MCP layer (`crates/ripvec-mcp/src/tools.rs`)
4207 /// did the matching inline with two bugs:
4208 ///
4209 /// 1. **`./` prefix mismatch.** [`RepoMapLspLocation::file_path`] always
4210 /// carries a leading `./` (see [`file_lsp_location`]), but
4211 /// [`FileNode::path`] does not. Passing the LSP location verbatim as
4212 /// `focus_file` matched zero files. The matcher silently returned
4213 /// `focus = None`, producing rank values bit-identical to the unfocused
4214 /// call — the bug originally reported as "I#20 focus_file rebias
4215 /// invisible on Python".
4216 /// 2. **Equal-length false negative.** When the user passed
4217 /// `./device_opt/services/storage.py` and the stored path was
4218 /// `device_opt/services/storage.py`, `exact` was false (the strings
4219 /// differ by two bytes) and `strict_suffix` was false (the focus is
4220 /// longer than the stored path, so `p.len() > focus.len()` fails). The
4221 /// pathology surfaced specifically when the focus was a *full* path
4222 /// with the LSP `./` prefix.
4223 ///
4224 /// Centralising the resolution here gives every caller the same
4225 /// normalization-tolerant semantics and one place to test the contract.
4226 #[must_use]
4227 pub fn resolve_focus_file(&self, focus: &str) -> FocusResolution {
4228 let normalized = normalize_focus_path(focus);
4229 let matches: Vec<usize> = self
4230 .files
4231 .iter()
4232 .enumerate()
4233 .filter_map(|(idx, f)| {
4234 if focus_matches_path(&f.path, normalized) {
4235 Some(idx)
4236 } else {
4237 None
4238 }
4239 })
4240 .collect();
4241 match matches.len() {
4242 0 => FocusResolution::NotFound,
4243 1 => FocusResolution::Found(matches[0]),
4244 _ => FocusResolution::Ambiguous(
4245 matches
4246 .into_iter()
4247 .map(|i| self.files[i].path.clone())
4248 .collect(),
4249 ),
4250 }
4251 }
4252}
4253
4254/// Result of resolving a user-supplied `focus_file` string against a [`RepoGraph`].
4255///
4256/// See [`RepoGraph::resolve_focus_file`] for the resolution semantics and the
4257/// historical bug that motivated the helper.
4258#[derive(Debug, Clone)]
4259pub enum FocusResolution {
4260 /// Exactly one file matched. Carries the file index in [`RepoGraph::files`].
4261 Found(usize),
4262 /// No file matched. The caller treats this as an unfocused call.
4263 NotFound,
4264 /// Two or more files matched. The caller surfaces the candidate list so
4265 /// the user can disambiguate by passing a longer suffix or the full path.
4266 Ambiguous(Vec<String>),
4267}
4268
4269/// Strip the leading `./` prefix from a focus_file path.
4270///
4271/// The `./` form is produced by [`file_lsp_location`] for every
4272/// [`RepoMapLspLocation::file_path`] field on a relative path. Stripping it
4273/// gives a stored-path-shaped value for the suffix matcher to compare
4274/// against [`FileNode::path`] entries (which do not carry the prefix).
4275///
4276/// Absolute paths (`/abs/path/file.py`) are returned unchanged; they will
4277/// fail the suffix match against the relative stored paths, which is the
4278/// correct behavior (the caller meant a different root entirely).
4279fn normalize_focus_path(focus: &str) -> &str {
4280 focus.strip_prefix("./").unwrap_or(focus)
4281}
4282
4283/// Return true when `focus` matches `stored_path` as either an exact path or
4284/// a strict-suffix (must be preceded by `/`). The empty focus does not match.
4285fn focus_matches_path(stored_path: &str, focus: &str) -> bool {
4286 if focus.is_empty() {
4287 return false;
4288 }
4289 if stored_path == focus {
4290 return true;
4291 }
4292 stored_path.len() > focus.len()
4293 && stored_path.ends_with(focus)
4294 && stored_path.as_bytes()[stored_path.len() - focus.len() - 1] == b'/'
4295}
4296
4297/// Build top-N caller and callee lists for each file.
4298///
4299/// Given a list of weighted directed edges `(src, dst, weight)` over `n`
4300/// nodes, returns `(callers[i], callees[i])` for each node `i`, where each
4301/// list contains the top-[`MAX_NEIGHBORS`] adjacent nodes sorted by descending
4302/// edge weight.
4303///
4304/// Exposed as `pub` so that integration tests can construct synthetic
4305/// [`RepoGraph`] instances for unit-testing the JSON rendering without going
4306/// through a full disk walk.
4307#[must_use]
4308pub fn build_neighbor_lists(n: usize, edges: &[(u32, u32, u32)]) -> (Vec<Vec<u32>>, Vec<Vec<u32>>) {
4309 let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
4310 let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
4311
4312 for &(src, dst, w) in edges {
4313 let (s, d) = (src as usize, dst as usize);
4314 if s < n && d < n {
4315 incoming[d].push((src, w));
4316 outgoing[s].push((dst, w));
4317 }
4318 }
4319
4320 // Sort by weight descending, keep top N
4321 let trim = |lists: &mut [Vec<(u32, u32)>]| -> Vec<Vec<u32>> {
4322 lists
4323 .iter_mut()
4324 .map(|list| {
4325 list.sort_by_key(|b| std::cmp::Reverse(b.1));
4326 list.iter()
4327 .take(MAX_NEIGHBORS)
4328 .map(|(idx, _)| *idx)
4329 .collect()
4330 })
4331 .collect()
4332 };
4333
4334 (trim(&mut incoming), trim(&mut outgoing))
4335}
4336
4337// ── Rendering ────────────────────────────────────────────────────────
4338
4339/// Render a budget-constrained overview of the repository.
4340///
4341/// Files are sorted by `PageRank` (or topic-sensitive rank if `focus` is
4342/// `Some`). Output uses four tiers of decreasing detail:
4343///
4344/// - **Tier 0** (top 10%): full path, rank, callers/callees, signatures with scopes
4345/// - **Tier 1** (next 20%): full path, rank, signatures
4346/// - **Tier 2** (next 40%): full path, rank, definition names and kinds
4347/// - **Tier 3** (bottom 30%): file path only
4348///
4349/// Stops accumulating output when the estimated token count exceeds
4350/// `max_tokens`.
4351#[must_use]
4352pub fn render(graph: &RepoGraph, max_tokens: usize, focus: Option<usize>) -> String {
4353 let n = graph.files.len();
4354 if n == 0 {
4355 return String::new();
4356 }
4357
4358 // Compute ranks (recompute topic-sensitive if focus is given)
4359 let ranks = if focus.is_some() {
4360 pagerank(n, &graph.edges, focus)
4361 } else {
4362 graph.base_ranks.clone()
4363 };
4364
4365 // Sort file indices by rank descending
4366 let mut sorted: Vec<usize> = (0..n).collect();
4367 sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
4368
4369 let mut output = String::new();
4370 let mut used_tokens = 0;
4371 let max_chars = max_tokens * CHARS_PER_TOKEN;
4372
4373 for (rank_pos, &file_idx) in sorted.iter().enumerate() {
4374 if used_tokens >= max_tokens {
4375 break;
4376 }
4377
4378 let file = &graph.files[file_idx];
4379 let score = ranks[file_idx];
4380 #[expect(clippy::cast_precision_loss, reason = "file counts fit in f32")]
4381 let percentile = (rank_pos as f32) / (n as f32);
4382
4383 let section = if percentile < 0.1 {
4384 render_tier0(graph, file_idx, file, score)
4385 } else if percentile < 0.3 {
4386 render_tier1(file, score)
4387 } else if percentile < 0.7 {
4388 render_tier2(file, score)
4389 } else {
4390 render_tier3(file)
4391 };
4392
4393 let section_chars = section.len();
4394 if used_tokens > 0 && used_tokens + section_chars / CHARS_PER_TOKEN > max_tokens {
4395 // Would exceed budget — try to fit at least the path
4396 let path_line = format!("{}\n", file.path);
4397 let path_tokens = path_line.len() / CHARS_PER_TOKEN;
4398 if used_tokens + path_tokens <= max_tokens {
4399 output.push_str(&path_line);
4400 }
4401 break;
4402 }
4403
4404 output.push_str(§ion);
4405 used_tokens = output.len().min(max_chars) / CHARS_PER_TOKEN;
4406 }
4407
4408 output
4409}
4410
4411/// Render tier 0: full detail with callers, callees, and signatures.
4412fn render_tier0(graph: &RepoGraph, file_idx: usize, file: &FileNode, score: f32) -> String {
4413 let mut out = format!("## {} (rank: {score:.4})\n", file.path);
4414
4415 // Callers
4416 if file_idx < graph.callers.len() && !graph.callers[file_idx].is_empty() {
4417 let _ = write!(out, " called by: ");
4418 let names: Vec<&str> = graph.callers[file_idx]
4419 .iter()
4420 .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
4421 .collect();
4422 let _ = writeln!(out, "{}", names.join(", "));
4423 }
4424
4425 // Callees
4426 if file_idx < graph.callees.len() && !graph.callees[file_idx].is_empty() {
4427 let _ = write!(out, " calls: ");
4428 let names: Vec<&str> = graph.callees[file_idx]
4429 .iter()
4430 .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
4431 .collect();
4432 let _ = writeln!(out, "{}", names.join(", "));
4433 }
4434
4435 // Definitions with scope and signature
4436 for def in &file.defs {
4437 let scope_prefix = if def.scope.is_empty() {
4438 String::new()
4439 } else {
4440 format!("{} > ", def.scope)
4441 };
4442 if let Some(sig) = &def.signature {
4443 let _ = writeln!(out, " {scope_prefix}{} {sig}", def.kind);
4444 } else {
4445 let _ = writeln!(out, " {scope_prefix}{} {}", def.kind, def.name);
4446 }
4447 }
4448 let _ = writeln!(out);
4449 out
4450}
4451
4452/// Render tier 1: file path, rank, and signatures.
4453fn render_tier1(file: &FileNode, score: f32) -> String {
4454 let mut out = format!("## {} (rank: {score:.4})\n", file.path);
4455 for def in &file.defs {
4456 if let Some(sig) = &def.signature {
4457 let _ = writeln!(out, " {sig}");
4458 } else {
4459 let _ = writeln!(out, " {} {}", def.kind, def.name);
4460 }
4461 }
4462 let _ = writeln!(out);
4463 out
4464}
4465
4466/// Render tier 2: file path, rank, and definition names/kinds.
4467fn render_tier2(file: &FileNode, score: f32) -> String {
4468 let mut out = format!("{} (rank: {score:.4})", file.path);
4469 if !file.defs.is_empty() {
4470 let names: Vec<String> = file
4471 .defs
4472 .iter()
4473 .map(|d| format!("{}:{}", d.kind, d.name))
4474 .collect();
4475 let _ = write!(out, " -- {}", names.join(", "));
4476 }
4477 let _ = writeln!(out);
4478 out
4479}
4480
4481/// Render tier 3: file path only.
4482fn render_tier3(file: &FileNode) -> String {
4483 format!("{}\n", file.path)
4484}
4485
4486// ── JSON rendering ───────────────────────────────────────────────────
4487
4488/// Build the `lsp_location` for a file itself (line 0).
4489fn file_lsp_location(path: &str) -> RepoMapLspLocation {
4490 RepoMapLspLocation {
4491 file_path: if path.starts_with("./") || path.starts_with('/') {
4492 path.to_string()
4493 } else {
4494 format!("./{path}")
4495 },
4496 start_line: 0,
4497 start_character: 0,
4498 end_line: 0,
4499 end_character: 0,
4500 }
4501}
4502
4503/// Infer `ContentKind` from a file path's extension.
4504fn content_kind_for_path(path: &str) -> ContentKind {
4505 let ext = std::path::Path::new(path)
4506 .extension()
4507 .and_then(|e| e.to_str())
4508 .unwrap_or("");
4509 ContentKind::from_extension(ext)
4510}
4511
4512/// Minimum byte envelope reserved for each included file.
4513///
4514/// Even a file with zero symbols takes JSON overhead for path, rank, arrays,
4515/// etc. Calibrated against actual serde_json output for an empty `RepoMapFile`:
4516/// `{"lsp_location":{"file_path":"./src/file_N.rs","start_line":0,"start_character":0,`
4517/// `"end_line":0,"end_character":0},"rank":0.1234,"content_kind":"code",`
4518/// `"calls":[],"symbols":[],"truncated_symbols":0,"truncated_calls":0}` ≈ 250 bytes.
4519///
4520/// This floor prevents the budget allocator from giving a file so little space
4521/// that it can emit no envelope at all.
4522const FILE_ENVELOPE_MIN_BYTES: usize = 250;
4523
4524/// Minimum useful payload for an admitted file: envelope plus room for at
4525/// least 2-3 typical-sized symbols. Files whose fair share cannot meet this
4526/// floor are excluded entirely (Fix A, 4.0.2). Without this guard, low-rank
4527/// tail files consume budget on envelopes that contain no symbols or calls,
4528/// crowding out content for the top files.
4529const FILE_MIN_USEFUL_BYTES: usize = 600;
4530
4531/// Fraction of each file's per-file budget reserved for outgoing-call edges
4532/// after the envelope is paid. The remaining (1 - this) fraction goes to
4533/// symbols. Symbol leftover flows into calls; call leftover flows to the
4534/// next file. (Fix C, 4.0.2 — without a reserve, the symbol loop saturates
4535/// the per-file budget and calls always come up empty.)
4536///
4537/// This fraction is the FLOOR: if the stored callees' exact byte cost exceeds
4538/// `post_envelope * CALLS_BUDGET_FRACTION`, the calls reserve is raised to the
4539/// exact cost (capped at `CALLS_MAX_FRACTION`). See B-0059 / 4.1.10.
4540const CALLS_BUDGET_FRACTION: f64 = 0.30;
4541
4542/// Maximum fraction of `post_envelope` that may be reserved for calls.
4543///
4544/// Prevents calls from starving symbols entirely when there are many callees
4545/// with long paths. `CALLS_BUDGET_FRACTION` sets a floor; this sets a ceiling.
4546/// Together they implement the precise-reserve strategy (B-0059 / 4.1.10):
4547/// the reserve is `clamp(exact_calls_bytes, fraction_floor, fraction_ceil)`.
4548const CALLS_MAX_FRACTION: f64 = 0.70;
4549
4550/// Maximum fraction of the total budget that a single file may claim.
4551///
4552/// Without this cap a single very-high-rank file (e.g. `lib.rs`) could
4553/// consume the entire budget, leaving all other files empty.
4554const MAX_FILE_SHARE: f64 = 0.40;
4555
4556/// AST kind priority for orientation-style symbol ordering. Higher = surface
4557/// earlier. Used when def-level PageRank is degenerate (most ranks near zero)
4558/// to fall back on structural signal rather than noise.
4559///
4560/// The intuition: a reader orienting in a codebase wants to see the file's
4561/// *shape* before its *behaviors*. Types declare shape; functions declare
4562/// behavior; fields and constants are internal detail. This ordering matches
4563/// how humans read code top-down. (Fix B, 4.0.2.)
4564fn ast_kind_priority(kind: &str) -> u32 {
4565 match kind {
4566 // Tier 3: shape — what THIS file is
4567 "trait_item" | "interface" | "trait" => 30,
4568 "struct_item" | "struct" | "class_definition" | "class" => 29,
4569 "enum_item" | "enum" => 28,
4570 "type_item" | "type_alias_declaration" | "type_alias" => 27,
4571 "mod_item" | "module" | "namespace" => 26,
4572 // Tier 2: behavior — what THIS file does
4573 "function_item" | "function_definition" | "function" | "method_definition" => 20,
4574 "impl_item" | "impl" => 19,
4575 // Tier 1: declarations
4576 "const_item" | "const_declaration" | "const" => 10,
4577 "static_item" | "static" => 9,
4578 // Tier 0: internals (fields, variables, parameters)
4579 _ => 0,
4580 }
4581}
4582
4583/// Effective AST priority with corpus-relative rank promotion (4.0.4).
4584///
4585/// Preserves the 4.0.2 AST-priority ordering by default (types first,
4586/// then functions, then fields). When a def's PageRank significantly
4587/// exceeds the corpus median, promotes it up one or two tiers so that
4588/// load-bearing defs surface alongside their declared-tier neighbors.
4589///
4590/// Thresholds are corpus-median multiples (self-calibrating):
4591/// - rank > 4× median → +1 tier (e.g., hot function joins type tier)
4592/// - rank > 16× median → +2 tiers (extremely hot def)
4593/// - otherwise → declared tier preserved
4594///
4595/// On degenerate (flat) rank distributions the median equals the floor,
4596/// nothing crosses threshold, and 4.0.2 AST-priority ordering is fully
4597/// preserved. On informative distributions (post-4.0.3 enrichment),
4598/// hot defs surface proportionally.
4599fn effective_priority(kind: &str, def_rank: f32, promo_1: f32, promo_2: f32) -> u32 {
4600 let base = ast_kind_priority(kind);
4601 // Accumulate promotion tiers as a plain integer to satisfy clippy's
4602 // bool_to_int_with_if lint while preserving branch clarity.
4603 let promo_tiers: u32 = u32::from(def_rank > promo_1) + u32::from(def_rank > promo_2);
4604 // Tier spacing matches ast_kind_priority's 10-unit gaps.
4605 base + promo_tiers * 10
4606}
4607
4608/// Estimate the serialised JSON byte cost of one `RepoMapSymbol`.
4609///
4610/// Calibrated against actual serde_json output. A `RepoMapSymbol` serialises to
4611/// approximately:
4612/// `{"name":"<N>","kind":<K>,"lsp_location":{"file_path":"<P>","start_line":0,`
4613/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
4614///
4615/// That is ~165 bytes of overhead (braces, keys, fixed-width integers, rank)
4616/// plus the name length and file_path length. We pass the path length
4617/// separately because the path is the same for all symbols in one file.
4618fn estimate_symbol_bytes(name: &str) -> usize {
4619 // 165 bytes overhead + name length.
4620 // The file_path is not included here because it is part of the
4621 // envelope cost accounted separately.
4622 165 + name.len()
4623}
4624
4625/// Estimate the serialised JSON byte cost of one `RepoMapCall`.
4626///
4627/// Each call entry: `{"lsp_location":{"file_path":"<P>","start_line":0,`
4628/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
4629/// ≈ 120 bytes overhead + path length.
4630fn estimate_call_bytes(target_path: &str) -> usize {
4631 120 + target_path.len()
4632}
4633
4634/// Render a `PageRank`-weighted JSON map with token-budget allocation (4.0.1).
4635///
4636/// # Algorithm
4637///
4638/// **Step 1 — File-share allocation.** Each eligible file receives a byte
4639/// budget proportional to its `base_rank`. The share is capped at 40% of
4640/// `budget_total_bytes` and floored at [`FILE_ENVELOPE_MIN_BYTES`] (200 B).
4641/// Files are included in rank order until the cumulative allocation would
4642/// exceed the total budget.
4643///
4644/// **Step 2 — Per-file symbol fill.** For each included file, symbols are
4645/// walked in def-rank descending order. Inclusion continues until either (a)
4646/// the file's budget share is exhausted (with carry-over of leftover bytes to
4647/// the next file) or (b) a logarithmic attenuation cutoff fires: symbol at
4648/// position `i` (0-based) is included only if its rank ≥ `top_rank /
4649/// (1 + ln(i + 1))`. `calls[]` is filled in target-file base-rank order
4650/// subject to a hard [`MAX_FILE_CALLS`] render cap (no attenuation — I#68,
4651/// 4.1.4) and per-file byte budget. `truncated_symbols` and `truncated_calls`
4652/// track the count of omitted entries.
4653///
4654/// **Step 3 — Response telemetry.** The response includes `estimated_bytes`
4655/// (actual returned content size), `budget_bytes` (`token_budget * 4`),
4656/// and `budget_exhausted` (`total_files > files.len()`).
4657///
4658/// # Arguments
4659///
4660/// - `graph` — the built dependency graph.
4661/// - `token_budget` — caller-specified token budget (× 4 = byte budget).
4662/// - `focus` — optional file index for topic-sensitive `PageRank`.
4663/// - `include_metadata` — when `false` (default), Meta-classified files
4664/// are excluded before ranking.
4665#[must_use]
4666#[expect(
4667 clippy::cast_precision_loss,
4668 reason = "rank sums and counts are small f32/f64; precision loss is acceptable"
4669)]
4670#[expect(
4671 clippy::too_many_lines,
4672 reason = "the three-step allocation algorithm (file-share → symbol-fill → calls-fill) \
4673 is sequential and share state; splitting into helpers would require passing \
4674 mutable slices across three boundaries with no clarity gain"
4675)]
4676pub fn render_json_budgeted(
4677 graph: &RepoGraph,
4678 token_budget: usize,
4679 focus: Option<usize>,
4680 include_metadata: bool,
4681) -> GetRepoMapResponse {
4682 let n = graph.files.len();
4683 if n == 0 {
4684 let budget_bytes = token_budget * CHARS_PER_TOKEN;
4685 return GetRepoMapResponse {
4686 files: vec![],
4687 total_files: 0,
4688 estimated_bytes: 0,
4689 budget_bytes,
4690 budget_exhausted: false,
4691 capped: false,
4692 };
4693 }
4694
4695 let budget_total_bytes = token_budget * CHARS_PER_TOKEN;
4696
4697 // Recompute topic-sensitive ranks if focus is given.
4698 let ranks = if focus.is_some() {
4699 pagerank(n, &graph.edges, focus)
4700 } else {
4701 graph.base_ranks.clone()
4702 };
4703
4704 // Sort all file indices by rank descending.
4705 let mut sorted: Vec<usize> = (0..n).collect();
4706 sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
4707
4708 // Apply metadata exclusion filter.
4709 let eligible: Vec<usize> = if include_metadata {
4710 sorted
4711 } else {
4712 sorted
4713 .into_iter()
4714 .filter(|&idx| {
4715 let kind = content_kind_for_path(&graph.files[idx].path);
4716 kind != ContentKind::Meta
4717 })
4718 .collect()
4719 };
4720
4721 let total_files = eligible.len();
4722
4723 // ── Corpus-median def-rank thresholds for tier promotion (4.0.4) ────────
4724 //
4725 // Compute once per call (corpus-wide, not per-file) so the threshold is
4726 // self-calibrating: flat distributions (all ranks equal) set median = floor
4727 // and nothing crosses threshold; informative distributions see proportional
4728 // promotion. Using corpus-wide median ensures a hot function in one file is
4729 // judged against the entire corpus, not just its local file peers.
4730 // Use the 75th percentile of nonzero def-ranks as the corpus reference value
4731 // for tier promotion (rather than the 50th percentile / median). The 75th
4732 // percentile is more robust: on a flat distribution most defs cluster near the
4733 // floor, so the 75th percentile is only marginally above the floor (making the
4734 // 4× threshold very selective). On an informative distribution (post-4.0.3
4735 // call-edge enrichment) the 75th percentile is meaningfully above the floor,
4736 // so the same 4× multiplier captures genuinely hot defs without falsely
4737 // promoting slightly-above-floor helpers.
4738 //
4739 // The 50th percentile (lower median) was rejected because on a 10-def corpus
4740 // with max/min ratio 5× the median equals the floor, causing the 4× threshold
4741 // to fire on defs that are only 5× above floor (a low-variance corpus). The
4742 // 75th percentile corrects this without requiring hand-tuned per-corpus magic
4743 // numbers.
4744 let corpus_reference_rank: f32 = {
4745 let mut nonzero: Vec<f32> = graph
4746 .def_ranks
4747 .iter()
4748 .copied()
4749 .filter(|r| *r > 0.0)
4750 .collect();
4751 if nonzero.is_empty() {
4752 0.0
4753 } else {
4754 nonzero.sort_unstable_by(f32::total_cmp);
4755 let n = nonzero.len();
4756 // 75th percentile index: floor(0.75 * (n - 1))
4757 let idx = (3 * (n - 1)) / 4;
4758 nonzero[idx]
4759 }
4760 };
4761 let promo_1_threshold = corpus_reference_rank * 4.0; // +1 tier
4762 let promo_2_threshold = corpus_reference_rank * 16.0; // +2 tiers
4763
4764 // ── Step 1: File-share allocation ────────────────────────────────
4765
4766 // Greedily determine which files fit within the budget, computing each
4767 // file's share as it is added. We must run a two-pass approach:
4768 // pass A: determine which files are included (cumulative sum check),
4769 // pass B: fill symbols/calls using final per-file allocations.
4770 //
4771 // The "included" decision is based on the running cumulative sum so that
4772 // the leftover redistribution in step 2 can carry forward correctly.
4773
4774 // Floor-first admission (Fix A, 4.0.2):
4775 //
4776 // Cap admitted file count so each gets at least FILE_MIN_USEFUL_BYTES.
4777 // Below this threshold the response would carry envelopes that contain
4778 // no symbols or calls — pure overhead, no information. Concentrating
4779 // the budget on fewer files with real content is strictly better for
4780 // orientation than dropping envelope sentinels for many files.
4781 let max_admissible = budget_total_bytes / FILE_MIN_USEFUL_BYTES;
4782 let admit_count = eligible.len().min(max_admissible.max(1));
4783
4784 let budget_f64 = budget_total_bytes as f64;
4785
4786 // Pre-compute rank sum across ADMITTED files only (top-N by rank). f64
4787 // to avoid precision loss when summing many small f32 values.
4788 let admitted_rank_sum: f64 = eligible
4789 .iter()
4790 .take(admit_count)
4791 .map(|&idx| f64::from(ranks[idx]))
4792 .sum();
4793 let admitted_rank_sum = if admitted_rank_sum > 0.0 {
4794 admitted_rank_sum
4795 } else {
4796 1.0
4797 };
4798
4799 // Compute per-file budgets. Each admitted file gets at least
4800 // FILE_MIN_USEFUL_BYTES; the proportional-to-rank share is applied on
4801 // top of the floor and capped at MAX_FILE_SHARE.
4802 let mut included_indices: Vec<usize> = Vec::new(); // indices into `eligible`
4803 let mut file_budgets: Vec<usize> = Vec::new();
4804 let mut cumulative_budget: usize = 0;
4805
4806 for (i, &file_idx) in eligible.iter().take(admit_count).enumerate() {
4807 let file_rank = f64::from(ranks[file_idx]);
4808 let raw_share = budget_f64 * file_rank / admitted_rank_sum;
4809 let capped = raw_share.min(budget_f64 * MAX_FILE_SHARE);
4810 // `capped` is non-negative and bounded by budget_f64 (a usize).
4811 #[expect(
4812 clippy::cast_possible_truncation,
4813 clippy::cast_sign_loss,
4814 reason = "capped is non-negative and bounded by budget_total_bytes (a usize)"
4815 )]
4816 let budget_i = (capped as usize).max(FILE_MIN_USEFUL_BYTES);
4817
4818 if cumulative_budget + budget_i > budget_total_bytes && !included_indices.is_empty() {
4819 break;
4820 }
4821 cumulative_budget += budget_i;
4822 included_indices.push(i);
4823 file_budgets.push(budget_i);
4824 }
4825
4826 // ── Step 2: Per-file symbol fill ─────────────────────────────────
4827
4828 let mut result_files: Vec<RepoMapFile> = Vec::with_capacity(included_indices.len());
4829 let mut leftover: usize = 0; // unused bytes carried from previous file
4830
4831 for (slot, &eligible_i) in included_indices.iter().enumerate() {
4832 let file_idx = eligible[eligible_i];
4833 let file = &graph.files[file_idx];
4834 let file_rank = ranks[file_idx];
4835 let file_path_lsp = file_lsp_location(&file.path);
4836
4837 let budget_in = file_budgets[slot] + leftover;
4838
4839 // Pre-compute callee indices here (before the budget split) so the
4840 // precise byte cost of rendering all stored callees is available when
4841 // setting calls_reserve. This is the B-0059 fix (4.1.10): the old code
4842 // placed callee_indices after the symbol loop, so calls_reserve was
4843 // always the fraction-floor — even when the stored callees needed more.
4844 let callee_indices: Vec<usize> = if file_idx < graph.callees.len() {
4845 let mut callees: Vec<(usize, f32)> = graph.callees[file_idx]
4846 .iter()
4847 .filter_map(|&ci| {
4848 let ci = ci as usize;
4849 graph.files.get(ci).map(|_| {
4850 let r = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
4851 (ci, r)
4852 })
4853 })
4854 .collect();
4855 callees.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
4856 callees.into_iter().map(|(ci, _)| ci).collect()
4857 } else {
4858 vec![]
4859 };
4860
4861 // Reserve a fraction of the post-envelope budget for outgoing calls
4862 // (Fix C, 4.0.2). Without this guard the symbol loop saturates
4863 // `budget_in` and the calls loop always trips its byte-check.
4864 // Symbol leftover flows into calls; call leftover flows to the
4865 // next file via the outer `leftover` variable.
4866 //
4867 // B-0059 (4.1.10): raise the reserve to the exact byte cost of
4868 // rendering all stored callees (up to MAX_FILE_CALLS), clamped to
4869 // [fraction_floor, CALLS_MAX_FRACTION * post_envelope]. This ensures
4870 // that at any budget where calls physically fit, they are not crowded
4871 // out by symbols consuming the fraction-floor reserve.
4872 let post_envelope = budget_in.saturating_sub(FILE_ENVELOPE_MIN_BYTES);
4873 let precise_calls_bytes: usize = callee_indices
4874 .iter()
4875 .take(MAX_FILE_CALLS)
4876 .map(|&ci| estimate_call_bytes(&graph.files[ci].path))
4877 .sum();
4878 #[expect(
4879 clippy::cast_possible_truncation,
4880 clippy::cast_sign_loss,
4881 reason = "post_envelope * fraction is bounded by post_envelope (a usize); result is non-negative"
4882 )]
4883 let fraction_floor = (post_envelope as f64 * CALLS_BUDGET_FRACTION) as usize;
4884 #[expect(
4885 clippy::cast_possible_truncation,
4886 clippy::cast_sign_loss,
4887 reason = "post_envelope * fraction is bounded by post_envelope (a usize); result is non-negative"
4888 )]
4889 let fraction_ceil = (post_envelope as f64 * CALLS_MAX_FRACTION) as usize;
4890 let calls_reserve = precise_calls_bytes.max(fraction_floor).min(fraction_ceil);
4891 let symbols_budget = FILE_ENVELOPE_MIN_BYTES + post_envelope.saturating_sub(calls_reserve);
4892 let mut used: usize = FILE_ENVELOPE_MIN_BYTES; // envelope cost
4893
4894 // ── Symbols ──────────────────────────────────────────────────
4895 // Retrieve def-level ranks for this file via the offset table.
4896 let def_count = file.defs.len();
4897 let def_offset = if file_idx < graph.def_offsets.len() {
4898 graph.def_offsets[file_idx]
4899 } else {
4900 0
4901 };
4902
4903 // Build (def_idx, rank, kind_priority, start_byte) tuples. We sort
4904 // by a composite key: AST kind priority (descending) — putting types
4905 // before functions before fields — then by def_rank (descending)
4906 // within each tier. This is Fix B (4.0.2): the def_rank distribution
4907 // is often degenerate (most defs share near-zero rank because the
4908 // call-edge extractor doesn't capture every dispatch), so we use
4909 // structural signal as the primary ordering and def_rank as the
4910 // within-tier tiebreaker. When def_rank IS informative, it dominates
4911 // *within* its kind tier and recovers the original behavior; the AST
4912 // signal only shifts ordering *between* tiers.
4913 let mut def_rank_pairs: Vec<(usize, f32, u32, u32)> = (0..def_count)
4914 .map(|di| {
4915 let flat = def_offset + di;
4916 let r = graph.def_ranks.get(flat).copied().unwrap_or(0.0);
4917 // Store the ORIGINAL ast_kind_priority in the tuple (used by the
4918 // per-tier attenuation loop below). The sort comparator uses
4919 // effective_priority (which may be higher due to 4.0.4 promotion)
4920 // to reorder hot defs ahead of cold type-tier defs, while the
4921 // attenuation tier tracker continues to use the original AST tier
4922 // so the existing per-tier cutoff behaviour is preserved.
4923 let kind_prio = ast_kind_priority(&file.defs[di].kind);
4924 let decl_order = file.defs[di].start_byte;
4925 (di, r, kind_prio, decl_order)
4926 })
4927 .collect();
4928 def_rank_pairs.sort_unstable_by(|a, b| {
4929 // Primary: effective priority (4.0.4: AST kind + corpus-rank promotion) descending.
4930 // Hot defs that exceed corpus-median thresholds are promoted above their
4931 // declared tier so they surface before cold type-tier defs.
4932 let eff_a = effective_priority(
4933 &file.defs[a.0].kind,
4934 a.1,
4935 promo_1_threshold,
4936 promo_2_threshold,
4937 );
4938 let eff_b = effective_priority(
4939 &file.defs[b.0].kind,
4940 b.1,
4941 promo_1_threshold,
4942 promo_2_threshold,
4943 );
4944 eff_b
4945 .cmp(&eff_a)
4946 // Secondary: def_rank descending within tier.
4947 .then_with(|| b.1.total_cmp(&a.1))
4948 // Tertiary: earlier declaration order (stable, deterministic).
4949 .then_with(|| a.3.cmp(&b.3))
4950 });
4951
4952 let top_def_rank = def_rank_pairs.first().map(|&(_, r, _, _)| r).unwrap_or(0.0);
4953
4954 let mut symbols: Vec<RepoMapSymbol> = Vec::new();
4955 let mut truncated_symbols: usize = 0;
4956
4957 // Track per-tier position for the attenuation cutoff. When AST kind
4958 // priority changes (we've moved from types to functions, say), reset
4959 // the position so the attenuation curve restarts. Otherwise a
4960 // structurally-equivalent-but-later tier would be unfairly cut.
4961 let mut tier_pos: usize = 0;
4962 let mut current_tier: Option<u32> = None;
4963 let mut tier_top_rank: f32 = top_def_rank;
4964
4965 for (pos, &(di, def_r, kind_prio, _)) in def_rank_pairs.iter().enumerate() {
4966 // Reset attenuation at tier boundaries.
4967 if current_tier != Some(kind_prio) {
4968 current_tier = Some(kind_prio);
4969 tier_pos = 0;
4970 tier_top_rank = def_r;
4971 }
4972
4973 // Logarithmic attenuation cutoff, relative to the tier's top rank.
4974 let cutoff = if tier_top_rank > 0.0 {
4975 tier_top_rank / (1.0 + (tier_pos as f32 + 1.0).ln())
4976 } else {
4977 0.0
4978 };
4979 if def_r < cutoff {
4980 // Attenuation cuts the rest of THIS tier; we don't stop
4981 // entirely because the next tier may still have useful
4982 // content within its own attenuation curve. Skip this def.
4983 truncated_symbols += 1;
4984 tier_pos += 1;
4985 continue;
4986 }
4987
4988 let def = &file.defs[di];
4989 let sym_bytes = estimate_symbol_bytes(&def.name);
4990 // Use the reserved symbols sub-budget (Fix C) so calls aren't
4991 // starved when symbols would otherwise saturate budget_in.
4992 if used + sym_bytes > symbols_budget {
4993 truncated_symbols += def_rank_pairs.len() - pos;
4994 break;
4995 }
4996
4997 // C2 (4.1.1): Use the AST-computed lsp_kind_hint when available
4998 // (populated at parse time for Python decorated_definition nodes).
4999 // Fall back to the AST-less string mapping for all other kinds.
5000 let kind = def
5001 .lsp_kind_hint
5002 .unwrap_or_else(|| crate::languages::lsp_symbol_kind_for_node_kind(&def.kind));
5003 let line_0 = def.start_line.saturating_sub(1) as usize;
5004 symbols.push(RepoMapSymbol {
5005 name: def.name.clone(),
5006 kind,
5007 lsp_location: RepoMapLspLocation {
5008 file_path: file_path_lsp.file_path.clone(),
5009 start_line: line_0,
5010 start_character: 0,
5011 end_line: line_0,
5012 end_character: 0,
5013 },
5014 rank: def_r,
5015 });
5016 used += sym_bytes;
5017 tier_pos += 1;
5018 }
5019
5020 // ── Calls ─────────────────────────────────────────────────────
5021 // `callee_indices` was pre-computed above (before the budget split) for
5022 // the precise-reserve calculation. Re-use it here.
5023 let call_total = callee_indices.len();
5024
5025 let mut calls: Vec<RepoMapCall> = Vec::new();
5026 let mut truncated_calls: usize = 0;
5027
5028 for (pos, &ci) in callee_indices.iter().enumerate() {
5029 let callee_rank = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
5030
5031 // Hard render-time cap (I#68, 4.1.4): stop once we have rendered
5032 // MAX_FILE_CALLS entries. This is symmetric with the MAX_NEIGHBORS
5033 // graph-build cap (I#60) and replaces the old logarithmic attenuation
5034 // cutoff. Attenuation was correct for *symbols* (rank distributions
5035 // are informative) but pathological for *call edges*: in real corpora
5036 // callee base-ranks follow a geometric distribution, causing attenuation
5037 // to fire at pos=1 and collapse calls[] to a single entry.
5038 if calls.len() >= MAX_FILE_CALLS {
5039 truncated_calls += call_total - pos;
5040 break;
5041 }
5042
5043 let callee_path = &graph.files[ci].path;
5044 let call_bytes = estimate_call_bytes(callee_path);
5045 if used + call_bytes > budget_in {
5046 truncated_calls += call_total - pos;
5047 break;
5048 }
5049
5050 calls.push(RepoMapCall {
5051 lsp_location: file_lsp_location(callee_path),
5052 rank: callee_rank,
5053 });
5054 used += call_bytes;
5055 }
5056
5057 // Carry unused bytes forward to the next file.
5058 leftover = budget_in.saturating_sub(used);
5059
5060 result_files.push(RepoMapFile {
5061 lsp_location: file_path_lsp,
5062 rank: file_rank,
5063 content_kind: content_kind_tag(content_kind_for_path(&file.path)),
5064 calls,
5065 symbols,
5066 truncated_symbols,
5067 truncated_calls,
5068 });
5069 }
5070
5071 let estimated_bytes = serde_json::to_string(&result_files)
5072 .map(|s| s.len())
5073 .unwrap_or(0);
5074
5075 let budget_exhausted = total_files > result_files.len();
5076
5077 GetRepoMapResponse {
5078 files: result_files,
5079 total_files,
5080 estimated_bytes,
5081 budget_bytes: budget_total_bytes,
5082 budget_exhausted,
5083 capped: budget_exhausted,
5084 }
5085}
5086
5087/// Render a `PageRank`-sorted JSON map of the repository (4.0.0 compatibility shim).
5088///
5089/// This function wraps [`render_json_budgeted`] with a synthetic token budget
5090/// derived from `max_files * 2000` (a generous per-file allowance). It exists
5091/// to keep the existing D1/D2 unit tests compiling without change; the MCP
5092/// layer calls [`render_json_budgeted`] directly in 4.0.1.
5093///
5094/// The `capped` field in the response reflects whether the budget was
5095/// exhausted before all `eligible` files were included, which is equivalent
5096/// to the previous `total_files > max_files` check.
5097///
5098/// When `include_metadata` is `false` (default), files whose extension
5099/// classifies as [`ContentKind::Meta`] are excluded before ranking.
5100#[must_use]
5101pub fn render_json(
5102 graph: &RepoGraph,
5103 max_files: usize,
5104 focus: Option<usize>,
5105 include_metadata: bool,
5106) -> GetRepoMapResponse {
5107 // Synthesise a generous token budget: 2000 tokens per requested file.
5108 // This ensures the existing D1/D2 tests (which pass small max_files values
5109 // like 3, 5, 50) see the same file-count behaviour they expect. The test
5110 // assertions check file counts, not byte sizes, so the exact budget value
5111 // only matters for ensuring enough headroom.
5112 let token_budget = max_files.saturating_mul(2000);
5113 render_json_budgeted(graph, token_budget, focus, include_metadata)
5114}
5115
5116// ── Tests ────────────────────────────────────────────────────────────
5117
5118#[cfg(test)]
5119mod tests {
5120 use super::*;
5121
5122 #[test]
5123 fn test_pagerank_simple() {
5124 // 3-node graph: 0 -> 1 -> 2, 2 -> 0 (cycle)
5125 let edges = vec![(0, 1, 1), (1, 2, 1), (2, 0, 1)];
5126 let ranks = pagerank(3, &edges, None);
5127
5128 // All nodes in a symmetric cycle should have equal rank
5129 assert_eq!(ranks.len(), 3);
5130 let sum: f32 = ranks.iter().sum();
5131 assert!(
5132 (sum - 1.0).abs() < 0.01,
5133 "ranks should sum to ~1.0, got {sum}"
5134 );
5135
5136 // In a perfect cycle, all ranks should be approximately equal
5137 let expected = 1.0 / 3.0;
5138 for (i, &r) in ranks.iter().enumerate() {
5139 assert!(
5140 (r - expected).abs() < 0.05,
5141 "rank[{i}] = {r}, expected ~{expected}"
5142 );
5143 }
5144 }
5145
5146 #[test]
5147 fn test_pagerank_star() {
5148 // Star graph: 0,1,2 all point to 3
5149 let edges = vec![(0, 3, 1), (1, 3, 1), (2, 3, 1)];
5150 let ranks = pagerank(4, &edges, None);
5151
5152 assert_eq!(ranks.len(), 4);
5153 // Node 3 should have the highest rank
5154 let max_idx = ranks
5155 .iter()
5156 .enumerate()
5157 .max_by(|a, b| a.1.total_cmp(b.1))
5158 .unwrap()
5159 .0;
5160 assert_eq!(max_idx, 3, "node 3 should have highest rank");
5161 assert!(
5162 ranks[3] > ranks[0],
5163 "rank[3]={} should be > rank[0]={}",
5164 ranks[3],
5165 ranks[0]
5166 );
5167 }
5168
5169 #[test]
5170 fn test_pagerank_topic_sensitive() {
5171 // 10-node chain: 0 -> 1 -> ... -> 9.
5172 //
5173 // With PERSONALIZATION_ALPHA = 0.15 and n = 10, the uniform share per
5174 // node is 1/10 = 0.10. The focus node (0) gets 0.15 teleportation
5175 // mass vs 0.10 uniform, so focused rank[0] > uniform rank[0] holds.
5176 //
5177 // The 3-node chain used previously broke when alpha was reduced from
5178 // 0.70 to 0.15 because 0.15 < 1/3 = 0.33 for n=3 — the focus node
5179 // received *less* teleportation than its uniform share, inverting the
5180 // expected direction. Using n=10 avoids this edge case while still
5181 // testing the personalization effect.
5182 let n = 10_usize;
5183 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5184 let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
5185 .map(|i| (i as u32, (i + 1) as u32, 1_u32))
5186 .collect();
5187 let uniform_ranks = pagerank(n, &edges, None);
5188 let biased_ranks = pagerank(n, &edges, Some(0));
5189
5190 // With focus on node 0, it should get a higher rank than uniform
5191 // because PERSONALIZATION_ALPHA (0.15) > 1/n (0.10) for n=10.
5192 assert!(
5193 biased_ranks[0] > uniform_ranks[0],
5194 "focused rank[0]={} should be > uniform rank[0]={}",
5195 biased_ranks[0],
5196 uniform_ranks[0]
5197 );
5198 }
5199
5200 // ── J1 tests — topic-sensitive PageRank soft personalization ─────────
5201
5202 /// J1 RED: `focus_file` PageRank must not collapse other-file ranks.
5203 ///
5204 /// Baseline (pre-4.0.5) concentrated 70% mass on the focus node, producing
5205 /// a degenerate Dirac delta: focus rank ≈ 0.703, all others ≈ 0.003.
5206 /// This test fails on the baseline and must pass after the fix.
5207 ///
5208 /// Invariant: with `PERSONALIZATION_ALPHA = 0.15`, focus node gets 0.15 of
5209 /// teleportation mass and each of the other (n-1) nodes gets 0.85/(n-1).
5210 /// On a star graph with n=10 nodes, the focus node rank must NOT be more
5211 /// than 40× the average non-focus rank. The 4.0.5 fix targets roughly
5212 /// 5-10× for a well-connected graph, so 40× is a conservative upper bound
5213 /// that the baseline (≈200×) fails.
5214 #[test]
5215 fn test_focus_file_topic_pagerank_preserves_rank_dispersion() {
5216 // Star graph: nodes 1..9 all point to node 0 (high natural rank).
5217 // Focus on node 1 (low natural rank) to test personalization effect.
5218 let n = 10_usize;
5219 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5220 let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
5221
5222 let ranks_focused = pagerank(n, &edges, Some(1));
5223
5224 let focus_rank = ranks_focused[1];
5225 let sum_non_focus: f32 = ranks_focused
5226 .iter()
5227 .enumerate()
5228 .filter(|&(i, _)| i != 1)
5229 .map(|(_, &r)| r)
5230 .sum();
5231 let n_non_focus = (n - 1) as f32;
5232 let avg_non_focus = sum_non_focus / n_non_focus;
5233
5234 let dispersion_ratio = focus_rank / avg_non_focus;
5235
5236 eprintln!(
5237 "J1 dispersion: focus_rank={focus_rank:.6}, avg_non_focus={avg_non_focus:.6}, \
5238 ratio={dispersion_ratio:.2}× (must be <= 40×)"
5239 );
5240
5241 // With 0.15 personalization alpha the focus node's teleportation
5242 // advantage is modest; 40× is an upper bound the old 0.70 code violates.
5243 assert!(
5244 dispersion_ratio <= 40.0,
5245 "focus rank is {dispersion_ratio:.1}× avg non-focus rank (must be ≤ 40×); \
5246 pre-fix baseline was ~200× due to 70% concentration — I#16"
5247 );
5248
5249 // Ranks must still sum to ~1.
5250 let total: f32 = ranks_focused.iter().sum();
5251 assert!(
5252 (total - 1.0).abs() < 0.01,
5253 "ranks must sum to ≈1.0; got {total}"
5254 );
5255 }
5256
5257 /// J1 RED: focus node must have the highest rank (it still gets the bias),
5258 /// but non-focus nodes must NOT collapse to a flat floor.
5259 ///
5260 /// Concretely: the second-highest-ranked file must be ≥ 10% of the focus
5261 /// file's rank (neighborhood rebiasing, not winner-take-all).
5262 #[test]
5263 fn test_focus_file_topic_pagerank_does_not_collapse_other_files() {
5264 // Linear chain: 0 → 1 → 2 → ... → 9 (directed).
5265 // Focus on node 0. Without personalization, ranks decrease along the
5266 // chain. With soft personalization the non-focus nodes stay non-trivial.
5267 let n = 10_usize;
5268 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5269 let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
5270 .map(|i| (i as u32, (i + 1) as u32, 1_u32))
5271 .collect();
5272
5273 let ranks = pagerank(n, &edges, Some(0));
5274
5275 let focus_rank = ranks[0];
5276 // All non-focus ranks must be ≥ 10% of focus rank.
5277 for (i, &r) in ranks.iter().enumerate().skip(1) {
5278 assert!(
5279 r >= focus_rank * 0.10,
5280 "rank[{i}] = {r:.6} is < 10% of focus rank {focus_rank:.6}; \
5281 non-focus files must not collapse to near-zero (I#16)"
5282 );
5283 }
5284 }
5285
5286 // ── J2 tests — neighborhood count parity ─────────────────────────────
5287
5288 /// J2 RED: `render_json_budgeted` with `focus=Some(i)` must return at
5289 /// least 70% as many files as the unfocused call with the same budget.
5290 ///
5291 /// Baseline (pre-4.0.5, α=0.70) collapsed the focused run to 1 dominant
5292 /// file + a flat tail — that's what this test was originally written to
5293 /// catch. The threshold was 80% when α was 0.15; raising α to 0.35
5294 /// (4.1.12, to satisfy the flask focus-in-top-3 invariant) pulls budget
5295 /// toward the focus neighborhood and drops file count to ~70% on small
5296 /// synthetic stars. 70% remains far from the "1 file + zero tail"
5297 /// degenerate baseline this test guards against.
5298 #[test]
5299 fn test_focus_file_returns_neighborhood_not_just_focus() {
5300 // Build a 12-file star graph with meaningful rank variation.
5301 let n = 12_usize;
5302 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5303 let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
5304 let base_ranks = pagerank(n, &edges, None);
5305 let (callers, callees) = build_neighbor_lists(n, &edges);
5306
5307 let file_nodes: Vec<FileNode> = (0..n)
5308 .map(|i| FileNode {
5309 path: format!("src/file_{i}.rs"),
5310 defs: vec![Definition {
5311 name: format!("func_{i}"),
5312 kind: "function_item".to_string(),
5313 start_line: 1,
5314 end_line: 5,
5315 scope: String::new(),
5316 signature: Some(format!("fn func_{i}() -> i32")),
5317 start_byte: 0,
5318 end_byte: 100,
5319 calls: vec![],
5320 decorator: None,
5321 lsp_kind_hint: None,
5322 }],
5323 imports: vec![],
5324 })
5325 .collect();
5326
5327 let graph = RepoGraph {
5328 files: file_nodes,
5329 edges,
5330 base_ranks,
5331 callers,
5332 callees,
5333 def_edges: vec![],
5334 def_ranks: vec![],
5335 def_callers: vec![],
5336 def_callees: vec![],
5337 def_offsets: vec![0],
5338 alpha: 0.5,
5339 };
5340
5341 let budget = 2000; // generous budget; all 12 files should fit
5342 let unfocused = render_json_budgeted(&graph, budget, None, false);
5343 let focused = render_json_budgeted(&graph, budget, Some(1), false);
5344
5345 let unfocused_n = unfocused.files.len();
5346 let focused_n = focused.files.len();
5347 #[expect(
5348 clippy::cast_possible_truncation,
5349 clippy::cast_sign_loss,
5350 reason = "unfocused_n is a file count (small, positive); f32 multiplication \
5351 by 0.80 and ceil produce a value in [0, n]; truncation to usize is safe"
5352 )]
5353 let min_expected = (unfocused_n as f32 * 0.70).ceil() as usize;
5354
5355 eprintln!(
5356 "J2 neighborhood: unfocused={unfocused_n} files, focused={focused_n} files \
5357 (need ≥ {min_expected})"
5358 );
5359
5360 assert!(
5361 focused_n >= min_expected,
5362 "focused call returned {focused_n} files; expected ≥ {min_expected} \
5363 (70% of unfocused {unfocused_n}); soft personalization must preserve \
5364 rank dispersion across files (I#16/J2)"
5365 );
5366 }
5367
5368 /// J2 RED: topic delta fingerprinting — focused run must reorder files
5369 /// relative to unfocused run (focus file surfaces near top), but both
5370 /// must contain similar total file counts.
5371 #[test]
5372 fn test_focus_delta_topic_fingerprinting_works() {
5373 // Bidirectional 8-file ring so all nodes are structurally equivalent.
5374 // Without focus all ranks are equal. With focus on node 3, node 3
5375 // must surface as the highest-ranked file.
5376 let n = 8_usize;
5377 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5378 let edges: Vec<(u32, u32, u32)> = (0..n)
5379 .flat_map(|i| {
5380 let next = ((i + 1) % n) as u32;
5381 let curr = i as u32;
5382 [(curr, next, 1_u32), (next, curr, 1_u32)]
5383 })
5384 .collect();
5385
5386 let ranks_uniform = pagerank(n, &edges, None);
5387 let ranks_focused = pagerank(n, &edges, Some(3));
5388
5389 // Focus node must have highest rank.
5390 let top_idx = ranks_focused
5391 .iter()
5392 .enumerate()
5393 .max_by(|a, b| a.1.total_cmp(b.1))
5394 .map(|(i, _)| i)
5395 .unwrap();
5396
5397 assert_eq!(
5398 top_idx, 3,
5399 "with focus=Some(3), node 3 must have highest rank; top was {top_idx}"
5400 );
5401
5402 // Uniform baseline: all ranks should be approximately equal.
5403 let uniform_max = ranks_uniform
5404 .iter()
5405 .copied()
5406 .fold(f32::NEG_INFINITY, f32::max);
5407 let uniform_min = ranks_uniform.iter().copied().fold(f32::INFINITY, f32::min);
5408 assert!(
5409 (uniform_max - uniform_min).abs() < 0.01,
5410 "on a ring without focus all ranks should be ≈equal; max={uniform_max:.6} min={uniform_min:.6}"
5411 );
5412
5413 // Focused run must rank the focus node significantly higher than others
5414 // but others must remain non-trivial (≥ 5% of focus).
5415 let focus_rank = ranks_focused[3];
5416 for (i, &r) in ranks_focused.iter().enumerate().filter(|&(i, _)| i != 3) {
5417 assert!(
5418 r >= focus_rank * 0.05,
5419 "rank[{i}]={r:.6} is < 5% of focus rank {focus_rank:.6}; \
5420 soft personalization must preserve non-focus ranks"
5421 );
5422 }
5423 }
5424
5425 // ── T1 tests — focus_file resolver normalization (I#20) ──────────────
5426
5427 /// Build a tiny synthetic graph whose `FileNode::path` values match the
5428 /// shape `build_graph` produces on disk (no leading `./`, forward slashes).
5429 fn focus_resolver_graph() -> RepoGraph {
5430 let file_nodes: Vec<FileNode> = vec![
5431 FileNode {
5432 path: "device_opt/services/storage.py".to_string(),
5433 defs: vec![],
5434 imports: vec![],
5435 },
5436 FileNode {
5437 path: "device_opt/ui/textual/screens/settings.py".to_string(),
5438 defs: vec![],
5439 imports: vec![],
5440 },
5441 FileNode {
5442 path: "device_opt/services/registry.py".to_string(),
5443 defs: vec![],
5444 imports: vec![],
5445 },
5446 FileNode {
5447 path: "tests/test_storage.py".to_string(),
5448 defs: vec![],
5449 imports: vec![],
5450 },
5451 ];
5452 let n = file_nodes.len();
5453 RepoGraph {
5454 files: file_nodes,
5455 edges: vec![],
5456 base_ranks: vec![1.0 / n as f32; n],
5457 callers: vec![vec![]; n],
5458 callees: vec![vec![]; n],
5459 def_edges: vec![],
5460 def_ranks: vec![],
5461 def_callers: vec![],
5462 def_callees: vec![],
5463 def_offsets: vec![0; n + 1],
5464 alpha: 0.5,
5465 }
5466 }
5467
5468 /// T1: focus_file paths emitted by `lsp_location.file_path` (with the
5469 /// `./` prefix) must resolve to the correct file index.
5470 ///
5471 /// Baseline reproduction (mnemosyne corpus, 4.0.5): passing
5472 /// `focus_file="./device_opt/.../settings.py"` produced rank values
5473 /// bit-identical to the unfocused call because the strict-suffix matcher
5474 /// in `tools.rs` failed both the `exact` and the `strict_suffix` checks
5475 /// when the focus carried the LSP `./` prefix. The matcher silently
5476 /// returned `focus = None`, masking the failure as "topic-sensitive
5477 /// PageRank does nothing on Python".
5478 #[test]
5479 fn test_focus_file_resolver_accepts_lsp_location_path() {
5480 let g = focus_resolver_graph();
5481 // LSP-shaped path with leading `./` — the form documented in
5482 // get_repo_map's instructions.
5483 let res = g.resolve_focus_file("./device_opt/ui/textual/screens/settings.py");
5484 match res {
5485 FocusResolution::Found(idx) => {
5486 assert_eq!(
5487 g.files[idx].path, "device_opt/ui/textual/screens/settings.py",
5488 "resolver must accept the ./-prefixed LSP path form (I#20)"
5489 );
5490 }
5491 FocusResolution::NotFound | FocusResolution::Ambiguous(_) => {
5492 panic!(
5493 "resolver returned {res:?} for ./device_opt/ui/textual/screens/settings.py; \
5494 the LSP-shaped path form must resolve to exactly one file (I#20)"
5495 );
5496 }
5497 }
5498 }
5499
5500 /// T1: the bare stored path (no `./`) must continue to resolve.
5501 /// Regression guard for the pre-fix matcher's "exact" path.
5502 #[test]
5503 fn test_focus_file_resolver_accepts_bare_stored_path() {
5504 let g = focus_resolver_graph();
5505 let res = g.resolve_focus_file("device_opt/services/storage.py");
5506 match res {
5507 FocusResolution::Found(idx) => {
5508 assert_eq!(g.files[idx].path, "device_opt/services/storage.py");
5509 }
5510 other => panic!("expected Found, got {other:?}"),
5511 }
5512 }
5513
5514 /// T1: strict-suffix match — `storage.py` must match
5515 /// `device_opt/services/storage.py` (prev char is `/`) but ambiguity
5516 /// (two `storage.py` files) must be reported, not silently picked.
5517 #[test]
5518 fn test_focus_file_resolver_strict_suffix_and_ambiguity() {
5519 let g = focus_resolver_graph();
5520 // "storage.py" matches both device_opt/services/storage.py and
5521 // tests/test_storage.py? No — test_storage.py has `_` before `storage.py`
5522 // (not `/`), so the strict-suffix matcher rejects it. Only one match.
5523 let res = g.resolve_focus_file("storage.py");
5524 assert!(
5525 matches!(res, FocusResolution::Found(_)),
5526 "strict-suffix `storage.py` must match exactly one file (the `_` in \
5527 test_storage.py blocks the strict-suffix), got {res:?}"
5528 );
5529 // Add a second services/storage.py-shaped file to force ambiguity.
5530 let mut g2 = g.clone();
5531 g2.files.push(FileNode {
5532 path: "vendored/services/storage.py".to_string(),
5533 defs: vec![],
5534 imports: vec![],
5535 });
5536 g2.base_ranks.push(0.0);
5537 g2.callers.push(vec![]);
5538 g2.callees.push(vec![]);
5539 g2.def_offsets.push(*g2.def_offsets.last().unwrap());
5540 let res = g2.resolve_focus_file("storage.py");
5541 match res {
5542 FocusResolution::Ambiguous(cands) => {
5543 assert_eq!(cands.len(), 2, "expected two candidates, got {cands:?}");
5544 }
5545 other => panic!("expected Ambiguous, got {other:?}"),
5546 }
5547 }
5548
5549 /// T1: a focus that matches no file returns `NotFound`. The caller
5550 /// is responsible for either treating this as unfocused or surfacing
5551 /// an error — the resolver itself does not impose policy.
5552 #[test]
5553 fn test_focus_file_resolver_not_found() {
5554 let g = focus_resolver_graph();
5555 let res = g.resolve_focus_file("./does/not/exist.py");
5556 assert!(
5557 matches!(res, FocusResolution::NotFound),
5558 "expected NotFound, got {res:?}"
5559 );
5560 }
5561
5562 /// T1: empty focus does not match anything (avoids the empty-suffix
5563 /// degenerate that would otherwise match every file).
5564 #[test]
5565 fn test_focus_file_resolver_empty_input_is_not_found() {
5566 let g = focus_resolver_graph();
5567 let res = g.resolve_focus_file("");
5568 assert!(
5569 matches!(res, FocusResolution::NotFound),
5570 "empty focus must not match anything, got {res:?}"
5571 );
5572 }
5573
5574 /// T1: focus_file rank delta must be visible on a Python-shaped
5575 /// synthetic graph.
5576 ///
5577 /// Builds a small Python-style call graph (FileNode + Definition with
5578 /// resolved CallRefs, matching what `extract_calls` produces on a real
5579 /// Python corpus), runs `build_graph_from_files_pub` to get a
5580 /// `RepoGraph`, then calls `render_json_budgeted` with and without
5581 /// focus. Asserts that the focused call changes the rank of at least
5582 /// one non-focus file by ≥ 5% in either direction.
5583 ///
5584 /// On the baseline (pre-T1) this test passes when the caller supplies
5585 /// an int focus_idx directly — the engine's topic-sensitive PageRank is
5586 /// correct. The bug was at the string-to-int resolver layer in
5587 /// `tools.rs`, which silently masked the failure as "the rendering
5588 /// path doesn't propagate focus". This test locks the engine's
5589 /// behavior so a future regression in the rendering path is caught.
5590 #[test]
5591 #[expect(
5592 clippy::too_many_lines,
5593 reason = "synthetic Python-shaped graph (five FileNodes with defs + \
5594 CallRefs + ImportRefs) plus the two-call assertion sequence \
5595 is inherently long; splitting into helpers would obscure the \
5596 one-shot reproduction the test is locking in."
5597 )]
5598 fn test_focus_file_rank_delta_visible_on_python_corpus() {
5599 // Five files, Python-shaped: services/storage.py (a "hub" that two
5600 // UI files call into) plus a tests/ file. The Python tree-sitter
5601 // extractor produces `class_definition` and `function_definition`
5602 // kinds with resolved CallRefs pointing at the hub.
5603 let mut files: Vec<FileNode> = vec![
5604 FileNode {
5605 path: "device_opt/services/storage.py".to_string(),
5606 defs: vec![
5607 Definition {
5608 name: "ScanStore".to_string(),
5609 kind: "class_definition".to_string(),
5610 start_line: 1,
5611 end_line: 80,
5612 scope: String::new(),
5613 signature: None,
5614 start_byte: 0,
5615 end_byte: 2000,
5616 calls: vec![],
5617 decorator: None,
5618 lsp_kind_hint: None,
5619 },
5620 Definition {
5621 name: "save_scan".to_string(),
5622 kind: "function_definition".to_string(),
5623 start_line: 20,
5624 end_line: 40,
5625 scope: "class_definition ScanStore".to_string(),
5626 signature: Some("def save_scan(self, scan)".to_string()),
5627 start_byte: 200,
5628 end_byte: 600,
5629 calls: vec![],
5630 decorator: None,
5631 lsp_kind_hint: None,
5632 },
5633 ],
5634 imports: vec![],
5635 },
5636 FileNode {
5637 path: "device_opt/services/registry.py".to_string(),
5638 defs: vec![Definition {
5639 name: "register".to_string(),
5640 kind: "function_definition".to_string(),
5641 start_line: 1,
5642 end_line: 30,
5643 scope: String::new(),
5644 signature: Some("def register(svc)".to_string()),
5645 start_byte: 0,
5646 end_byte: 600,
5647 calls: vec![CallRef {
5648 name: "save_scan".to_string(),
5649 qualified_path: None,
5650 receiver_type: None,
5651 byte_offset: 100,
5652 resolved: None,
5653 }],
5654 decorator: None,
5655 lsp_kind_hint: None,
5656 }],
5657 imports: vec![ImportRef {
5658 raw_path: "from device_opt.services import storage".to_string(),
5659 resolved_idx: Some(0),
5660 }],
5661 },
5662 FileNode {
5663 path: "device_opt/ui/screens/browse.py".to_string(),
5664 defs: vec![Definition {
5665 name: "browse_scans".to_string(),
5666 kind: "function_definition".to_string(),
5667 start_line: 1,
5668 end_line: 50,
5669 scope: String::new(),
5670 signature: Some("def browse_scans(app)".to_string()),
5671 start_byte: 0,
5672 end_byte: 1000,
5673 calls: vec![CallRef {
5674 name: "save_scan".to_string(),
5675 qualified_path: None,
5676 receiver_type: None,
5677 byte_offset: 200,
5678 resolved: None,
5679 }],
5680 decorator: None,
5681 lsp_kind_hint: None,
5682 }],
5683 imports: vec![ImportRef {
5684 raw_path: "from device_opt.services import storage".to_string(),
5685 resolved_idx: Some(0),
5686 }],
5687 },
5688 FileNode {
5689 path: "device_opt/ui/screens/settings.py".to_string(),
5690 defs: vec![Definition {
5691 name: "open_settings".to_string(),
5692 kind: "function_definition".to_string(),
5693 start_line: 1,
5694 end_line: 40,
5695 scope: String::new(),
5696 signature: Some("def open_settings(app)".to_string()),
5697 start_byte: 0,
5698 end_byte: 800,
5699 calls: vec![CallRef {
5700 name: "register".to_string(),
5701 qualified_path: None,
5702 receiver_type: None,
5703 byte_offset: 150,
5704 resolved: None,
5705 }],
5706 decorator: None,
5707 lsp_kind_hint: None,
5708 }],
5709 imports: vec![ImportRef {
5710 raw_path: "from device_opt.services import registry".to_string(),
5711 resolved_idx: Some(1),
5712 }],
5713 },
5714 FileNode {
5715 path: "tests/test_storage.py".to_string(),
5716 defs: vec![Definition {
5717 name: "test_save".to_string(),
5718 kind: "function_definition".to_string(),
5719 start_line: 1,
5720 end_line: 20,
5721 scope: String::new(),
5722 signature: Some("def test_save()".to_string()),
5723 start_byte: 0,
5724 end_byte: 400,
5725 calls: vec![CallRef {
5726 name: "save_scan".to_string(),
5727 qualified_path: None,
5728 receiver_type: None,
5729 byte_offset: 50,
5730 resolved: None,
5731 }],
5732 decorator: None,
5733 lsp_kind_hint: None,
5734 }],
5735 imports: vec![ImportRef {
5736 raw_path: "from device_opt.services import storage".to_string(),
5737 resolved_idx: Some(0),
5738 }],
5739 },
5740 ];
5741
5742 // Resolve calls so the graph builder has edges to chew on.
5743 let def_index = build_def_index(&files);
5744 resolve_calls(&mut files, &def_index, &HashMap::new());
5745 let graph = build_graph_from_files_pub(files);
5746
5747 // Sanity: the graph must have edges (the calls were resolved).
5748 assert!(
5749 !graph.edges.is_empty(),
5750 "Python-shaped synthetic graph must produce file-level edges; got 0. \
5751 The CallRefs may have failed to resolve."
5752 );
5753
5754 // Resolve the focus file via the new helper.
5755 let focus_idx = match graph.resolve_focus_file("./device_opt/ui/screens/settings.py") {
5756 FocusResolution::Found(i) => i,
5757 other => panic!("resolver must find settings.py via LSP-shaped path, got {other:?}"),
5758 };
5759
5760 let budget = 4000;
5761 let unfocused = render_json_budgeted(&graph, budget, None, false);
5762 let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
5763
5764 // Collect rank-by-path maps for both runs.
5765 let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
5766 .files
5767 .iter()
5768 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5769 .collect();
5770 let focused_ranks: std::collections::HashMap<String, f32> = focused
5771 .files
5772 .iter()
5773 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5774 .collect();
5775
5776 eprintln!("T1 Python — unfocused ranks: {unfocused_ranks:#?}");
5777 eprintln!("T1 Python — focused ranks: {focused_ranks:#?}");
5778
5779 // Find at least one non-focus file whose rank changed by ≥ 5% in
5780 // either direction. The threshold is conservative; the soft 0.15
5781 // personalization alpha redistributes mass enough that on this
5782 // 5-node graph the affected neighbors typically shift by 20%+.
5783 let focus_path = "./device_opt/ui/screens/settings.py";
5784 let mut max_delta_ratio = 0.0_f32;
5785 for (path, &u_rank) in &unfocused_ranks {
5786 if path == focus_path {
5787 continue;
5788 }
5789 if let Some(&f_rank) = focused_ranks.get(path)
5790 && u_rank > 0.0
5791 {
5792 let ratio = (f_rank - u_rank).abs() / u_rank;
5793 if ratio > max_delta_ratio {
5794 max_delta_ratio = ratio;
5795 }
5796 }
5797 }
5798 assert!(
5799 max_delta_ratio >= 0.05,
5800 "focus_file must rebias non-focus file ranks by ≥ 5%; \
5801 max observed delta ratio = {max_delta_ratio:.3} \
5802 (I#20: focus_file invisible on Python corpora)"
5803 );
5804
5805 // Bit-identity guard: at least one non-focus file's rank must NOT
5806 // equal its unfocused value. This is the pathology from the
5807 // mnemosyne reproduction: every rank value was bit-identical
5808 // across global/focused calls.
5809 let any_changed = unfocused_ranks.iter().any(|(path, &u_rank)| {
5810 path != focus_path
5811 && focused_ranks
5812 .get(path)
5813 .is_some_and(|&f_rank| f_rank.to_bits() != u_rank.to_bits())
5814 });
5815 assert!(
5816 any_changed,
5817 "no non-focus file rank changed across focused/unfocused calls — \
5818 bit-identical pathology (I#20). unfocused={unfocused_ranks:#?} \
5819 focused={focused_ranks:#?}"
5820 );
5821 }
5822
5823 /// T1: focus_file rank delta on a Rust-shaped synthetic graph.
5824 ///
5825 /// Regression test: confirms the engine's topic-sensitive PageRank
5826 /// works on Rust shapes (where T1's investigation found it already
5827 /// works, but the resolver fix must not break the existing path).
5828 ///
5829 /// This complements `test_focus_file_returns_neighborhood_not_just_focus`
5830 /// by additionally checking that (a) the resolver accepts a Rust path
5831 /// with the `./` LSP prefix, and (b) at least one non-focus file's
5832 /// rank moves by ≥ 5%.
5833 #[test]
5834 #[expect(
5835 clippy::too_many_lines,
5836 reason = "synthetic Rust-shaped graph with four FileNodes plus the \
5837 two-call assertion sequence inherently exceeds the 100-line \
5838 cap; the test mirrors the Python-shaped sibling."
5839 )]
5840 fn test_focus_file_rank_delta_preserved_on_rust_corpus() {
5841 let mut files: Vec<FileNode> = vec![
5842 FileNode {
5843 path: "src/lib.rs".to_string(),
5844 defs: vec![Definition {
5845 name: "Engine".to_string(),
5846 kind: "struct_item".to_string(),
5847 start_line: 1,
5848 end_line: 30,
5849 scope: String::new(),
5850 signature: None,
5851 start_byte: 0,
5852 end_byte: 600,
5853 calls: vec![],
5854 decorator: None,
5855 lsp_kind_hint: None,
5856 }],
5857 imports: vec![],
5858 },
5859 FileNode {
5860 path: "src/encoder/mod.rs".to_string(),
5861 defs: vec![Definition {
5862 name: "encode".to_string(),
5863 kind: "function_item".to_string(),
5864 start_line: 1,
5865 end_line: 40,
5866 scope: String::new(),
5867 signature: Some("fn encode(input: &str) -> Vec<f32>".to_string()),
5868 start_byte: 0,
5869 end_byte: 800,
5870 calls: vec![],
5871 decorator: None,
5872 lsp_kind_hint: None,
5873 }],
5874 imports: vec![ImportRef {
5875 raw_path: "use crate::lib;".to_string(),
5876 resolved_idx: Some(0),
5877 }],
5878 },
5879 FileNode {
5880 path: "src/search.rs".to_string(),
5881 defs: vec![Definition {
5882 name: "search".to_string(),
5883 kind: "function_item".to_string(),
5884 start_line: 1,
5885 end_line: 30,
5886 scope: String::new(),
5887 signature: Some("fn search(q: &str) -> Hits".to_string()),
5888 start_byte: 0,
5889 end_byte: 600,
5890 calls: vec![CallRef {
5891 name: "encode".to_string(),
5892 qualified_path: None,
5893 receiver_type: None,
5894 byte_offset: 100,
5895 resolved: None,
5896 }],
5897 decorator: None,
5898 lsp_kind_hint: None,
5899 }],
5900 imports: vec![ImportRef {
5901 raw_path: "use crate::encoder;".to_string(),
5902 resolved_idx: Some(1),
5903 }],
5904 },
5905 FileNode {
5906 path: "src/cli.rs".to_string(),
5907 defs: vec![Definition {
5908 name: "main".to_string(),
5909 kind: "function_item".to_string(),
5910 start_line: 1,
5911 end_line: 20,
5912 scope: String::new(),
5913 signature: Some("fn main()".to_string()),
5914 start_byte: 0,
5915 end_byte: 400,
5916 calls: vec![CallRef {
5917 name: "search".to_string(),
5918 qualified_path: None,
5919 receiver_type: None,
5920 byte_offset: 50,
5921 resolved: None,
5922 }],
5923 decorator: None,
5924 lsp_kind_hint: None,
5925 }],
5926 imports: vec![ImportRef {
5927 raw_path: "use crate::search;".to_string(),
5928 resolved_idx: Some(2),
5929 }],
5930 },
5931 ];
5932
5933 let def_index = build_def_index(&files);
5934 resolve_calls(&mut files, &def_index, &HashMap::new());
5935 let graph = build_graph_from_files_pub(files);
5936
5937 assert!(
5938 !graph.edges.is_empty(),
5939 "Rust-shaped synthetic graph must produce edges"
5940 );
5941
5942 let focus_idx = match graph.resolve_focus_file("./src/encoder/mod.rs") {
5943 FocusResolution::Found(i) => i,
5944 other => panic!("resolver must find encoder/mod.rs via LSP path, got {other:?}"),
5945 };
5946
5947 let budget = 4000;
5948 let unfocused = render_json_budgeted(&graph, budget, None, false);
5949 let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
5950
5951 let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
5952 .files
5953 .iter()
5954 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5955 .collect();
5956 let focused_ranks: std::collections::HashMap<String, f32> = focused
5957 .files
5958 .iter()
5959 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5960 .collect();
5961
5962 eprintln!("T1 Rust — unfocused: {unfocused_ranks:#?}");
5963 eprintln!("T1 Rust — focused: {focused_ranks:#?}");
5964
5965 let focus_path = "./src/encoder/mod.rs";
5966 let mut max_delta_ratio = 0.0_f32;
5967 for (path, &u_rank) in &unfocused_ranks {
5968 if path == focus_path {
5969 continue;
5970 }
5971 if let Some(&f_rank) = focused_ranks.get(path)
5972 && u_rank > 0.0
5973 {
5974 let ratio = (f_rank - u_rank).abs() / u_rank;
5975 if ratio > max_delta_ratio {
5976 max_delta_ratio = ratio;
5977 }
5978 }
5979 }
5980 assert!(
5981 max_delta_ratio >= 0.05,
5982 "focus_file must rebias non-focus file ranks by ≥ 5% on Rust shapes; \
5983 max observed delta = {max_delta_ratio:.3}"
5984 );
5985 }
5986
5987 #[test]
5988 fn test_pagerank_empty() {
5989 let ranks = pagerank(0, &[], None);
5990 assert!(ranks.is_empty());
5991 }
5992
5993 #[test]
5994 fn test_render_tiers() {
5995 // Build a small graph with 10 files to exercise all tiers
5996 let files: Vec<FileNode> = (0..10)
5997 .map(|i| FileNode {
5998 path: format!("src/file_{i}.rs"),
5999 defs: vec![Definition {
6000 name: format!("func_{i}"),
6001 kind: "function_item".to_string(),
6002 start_line: 1,
6003 end_line: 5,
6004 scope: String::new(),
6005 signature: Some(format!("func_{i}(x: i32) -> i32")),
6006 start_byte: 0,
6007 end_byte: 0,
6008 calls: vec![],
6009 decorator: None,
6010 lsp_kind_hint: None,
6011 }],
6012 imports: vec![],
6013 })
6014 .collect();
6015
6016 // Create a star graph: files 1-9 all import from file 0
6017 let edges: Vec<(u32, u32, u32)> = (1..10).map(|i| (i, 0, 1)).collect();
6018 let base_ranks = pagerank(10, &edges, None);
6019 let (top_callers, top_callees) = build_neighbor_lists(10, &edges);
6020
6021 let graph = RepoGraph {
6022 files,
6023 edges,
6024 base_ranks,
6025 callers: top_callers,
6026 callees: top_callees,
6027 def_edges: vec![],
6028 def_ranks: vec![],
6029 def_callers: vec![],
6030 def_callees: vec![],
6031 def_offsets: vec![0],
6032 alpha: 0.5,
6033 };
6034
6035 // Large budget: should include all files
6036 let full = render(&graph, 10_000, None);
6037 assert!(
6038 full.contains("file_0"),
6039 "output should contain the top-ranked file"
6040 );
6041 // file_0 should appear as tier 0 (highest rank)
6042 assert!(
6043 full.contains("## src/file_0.rs"),
6044 "top file should have tier 0 heading"
6045 );
6046
6047 // Tiny budget: should only fit a few files
6048 let small = render(&graph, 10, None);
6049 assert!(
6050 !small.is_empty(),
6051 "even tiny budget should produce some output"
6052 );
6053 // Should have fewer entries than full render
6054 let full_lines = full.lines().count();
6055 let small_lines = small.lines().count();
6056 assert!(
6057 small_lines < full_lines,
6058 "small budget ({small_lines} lines) should have fewer lines than full ({full_lines})"
6059 );
6060 }
6061
6062 #[test]
6063 fn test_render_empty_graph() {
6064 let graph = RepoGraph {
6065 files: vec![],
6066 edges: vec![],
6067 base_ranks: vec![],
6068 callers: vec![],
6069 callees: vec![],
6070 def_edges: vec![],
6071 def_ranks: vec![],
6072 def_callers: vec![],
6073 def_callees: vec![],
6074 def_offsets: vec![0],
6075 alpha: 0.5,
6076 };
6077 let output = render(&graph, 1000, None);
6078 assert!(output.is_empty(), "empty graph should render empty string");
6079 }
6080
6081 #[test]
6082 fn test_build_graph_on_fixtures() {
6083 let fixtures = Path::new(env!("CARGO_MANIFEST_DIR"))
6084 .parent()
6085 .unwrap()
6086 .parent()
6087 .unwrap()
6088 .join("tests")
6089 .join("fixtures");
6090
6091 let graph = build_graph(&fixtures).expect("build_graph should succeed on fixtures");
6092
6093 // Should find at least the 3 fixture files
6094 assert!(
6095 !graph.files.is_empty(),
6096 "graph should contain files from fixtures"
6097 );
6098
6099 // Should find definitions in the Rust fixture
6100 let rs_file = graph.files.iter().find(|f| f.path.ends_with("sample.rs"));
6101 assert!(rs_file.is_some(), "should find sample.rs");
6102 let rs_file = rs_file.unwrap();
6103 assert!(
6104 !rs_file.defs.is_empty(),
6105 "sample.rs should have definitions"
6106 );
6107 assert!(
6108 rs_file.defs.iter().any(|d| d.name == "hello"),
6109 "should find 'hello' function in sample.rs"
6110 );
6111
6112 // Should find definitions in the Python fixture
6113 let py_file = graph.files.iter().find(|f| f.path.ends_with("sample.py"));
6114 assert!(py_file.is_some(), "should find sample.py");
6115 let py_file = py_file.unwrap();
6116 assert!(
6117 !py_file.defs.is_empty(),
6118 "sample.py should have definitions"
6119 );
6120 assert!(
6121 py_file.defs.iter().any(|d| d.name == "greet"),
6122 "should find 'greet' function in sample.py"
6123 );
6124
6125 // PageRank scores should be computed
6126 assert_eq!(graph.base_ranks.len(), graph.files.len());
6127 let sum: f32 = graph.base_ranks.iter().sum();
6128 assert!(
6129 (sum - 1.0).abs() < 0.01,
6130 "PageRank scores should sum to ~1.0, got {sum}"
6131 );
6132 }
6133
6134 #[test]
6135 fn test_extract_imports_rust() {
6136 let source = "use crate::foo::bar;\nuse std::collections::HashMap;\n";
6137 let (lang, query) = import_query_for_extension("rs").unwrap();
6138 let imports = extract_imports(source, &lang, &query);
6139 assert_eq!(imports.len(), 2);
6140 assert!(imports[0].contains("crate::foo::bar"));
6141 }
6142
6143 #[test]
6144 fn test_extract_imports_python_stub() {
6145 let source = "from typing import Protocol\nimport pkg.types\n";
6146 let (lang, query) = import_query_for_extension("pyi").unwrap();
6147 let imports = extract_imports(source, &lang, &query);
6148 assert_eq!(imports.len(), 2);
6149 assert!(imports[0].contains("from typing import Protocol"));
6150 assert!(imports[1].contains("import pkg.types"));
6151 }
6152
6153 #[test]
6154 fn test_resolve_python_import_to_stub_file() {
6155 let root = PathBuf::from("/project");
6156 let mut file_index = HashMap::new();
6157 file_index.insert(PathBuf::from("/project/pkg/types.pyi"), 1);
6158
6159 let result = resolve_python_import("import pkg.types", &root, &file_index);
6160 assert_eq!(result, Some(1));
6161 }
6162
6163 #[test]
6164 fn test_resolve_rust_crate_import() {
6165 let root = PathBuf::from("/project");
6166 let file_path = PathBuf::from("/project/src/main.rs");
6167 let mut file_index = HashMap::new();
6168 file_index.insert(PathBuf::from("/project/src/foo/bar.rs"), 1);
6169 file_index.insert(PathBuf::from("/project/src/main.rs"), 0);
6170
6171 let result = resolve_rust_import("use crate::foo::bar;", &file_path, &root, &file_index);
6172 assert_eq!(result, Some(1));
6173 }
6174
6175 #[test]
6176 fn test_resolve_rust_external_crate_dropped() {
6177 let root = PathBuf::from("/project");
6178 let file_path = PathBuf::from("/project/src/main.rs");
6179 let file_index = HashMap::new();
6180
6181 let result = resolve_rust_import(
6182 "use std::collections::HashMap;",
6183 &file_path,
6184 &root,
6185 &file_index,
6186 );
6187 assert_eq!(result, None, "external crate imports should be dropped");
6188 }
6189
6190 #[test]
6191 fn test_neighbor_lists() {
6192 // 0 -> 1, 0 -> 2, 1 -> 2
6193 let edges = vec![(0, 1, 1), (0, 2, 1), (1, 2, 1)];
6194 let (incoming, outgoing) = build_neighbor_lists(3, &edges);
6195
6196 // Node 2 should be called by 0 and 1
6197 assert!(incoming[2].contains(&0));
6198 assert!(incoming[2].contains(&1));
6199
6200 // Node 0 should call 1 and 2
6201 assert!(outgoing[0].contains(&1));
6202 assert!(outgoing[0].contains(&2));
6203 }
6204
6205 /// G1 (R2.3 issue a): A scoped call `mod_a::foo()` must store:
6206 /// - `name = "foo"` (bare identifier, for def-index lookup)
6207 /// - `qualified_path = Some("mod_a::foo")` (full path, for disambiguation)
6208 ///
6209 /// Before G1, `name` stored the full `"mod_a::foo"` path. After G1, `name`
6210 /// is always the bare trailing identifier and `qualified_path` carries the
6211 /// full path when the call is scoped.
6212 #[test]
6213 fn test_scoped_identifier_calls_preserve_path() {
6214 use crate::languages;
6215 use streaming_iterator::StreamingIterator as _;
6216
6217 let source = "
6218mod mod_a {
6219 pub fn foo() {}
6220}
6221mod mod_b {
6222 pub fn foo() {}
6223}
6224fn caller() {
6225 mod_a::foo();
6226 mod_b::foo();
6227}
6228";
6229 let call_config =
6230 languages::call_query_for_extension("rs").expect("Rust call config must exist");
6231 let lang_config =
6232 languages::config_for_extension("rs").expect("Rust lang config must exist");
6233
6234 let mut defs = {
6235 let mut parser = tree_sitter::Parser::new();
6236 parser.set_language(&lang_config.language).unwrap();
6237 let tree = parser.parse(source, None).unwrap();
6238 let mut cursor = tree_sitter::QueryCursor::new();
6239 let mut out = Vec::new();
6240 let mut matches =
6241 cursor.matches(&lang_config.query, tree.root_node(), source.as_bytes());
6242 while let Some(m) = matches.next() {
6243 let mut name = String::new();
6244 let mut def_node = None;
6245 for cap in m.captures {
6246 let cname = &lang_config.query.capture_names()[cap.index as usize];
6247 if *cname == "name" {
6248 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
6249 } else if *cname == "def" {
6250 def_node = Some(cap.node);
6251 }
6252 }
6253 if let Some(node) = def_node {
6254 #[expect(clippy::cast_possible_truncation)]
6255 out.push(Definition {
6256 name,
6257 kind: node.kind().to_string(),
6258 start_line: node.start_position().row as u32 + 1,
6259 end_line: node.end_position().row as u32 + 1,
6260 scope: String::new(),
6261 signature: None,
6262 start_byte: node.start_byte() as u32,
6263 end_byte: node.end_byte() as u32,
6264 calls: vec![],
6265 decorator: None,
6266 lsp_kind_hint: None,
6267 });
6268 }
6269 }
6270 out
6271 };
6272
6273 extract_calls(source, &call_config, &mut defs);
6274
6275 // Find the `caller` function definition
6276 let caller_def = defs
6277 .iter()
6278 .find(|d| d.name == "caller")
6279 .expect("caller def");
6280
6281 // G1: bare name is "foo", qualified_path carries the module path.
6282 let call_names: Vec<&str> = caller_def.calls.iter().map(|c| c.name.as_str()).collect();
6283 let qualified_paths: Vec<Option<&str>> = caller_def
6284 .calls
6285 .iter()
6286 .map(|c| c.qualified_path.as_deref())
6287 .collect();
6288
6289 // Bare names must be the trailing identifier only.
6290 assert!(
6291 call_names.contains(&"foo"),
6292 "bare name 'foo' must appear for scoped calls; got: {call_names:?}"
6293 );
6294 // Qualified paths must carry the full scope.
6295 assert!(
6296 qualified_paths.contains(&Some("mod_a::foo")),
6297 "qualified_path 'mod_a::foo' must appear; got: {qualified_paths:?}"
6298 );
6299 assert!(
6300 qualified_paths.contains(&Some("mod_b::foo")),
6301 "qualified_path 'mod_b::foo' must appear; got: {qualified_paths:?}"
6302 );
6303 // Full paths must NOT appear in bare names.
6304 assert!(
6305 !call_names.contains(&"mod_a::foo"),
6306 "full path 'mod_a::foo' must not appear in bare name; got: {call_names:?}"
6307 );
6308 }
6309
6310 /// RED test (R2.3 issue b+c): Two defs named `Read` in different modules,
6311 /// an unqualified call to `Read`. Resolution must NOT silently pick the first.
6312 /// Either both are returned (ambiguous) or none.
6313 #[test]
6314 fn test_ambiguous_name_resolution_returns_all_or_none() {
6315 // Build two FileNodes each with a def named "Read", then a third with an
6316 // unqualified call to "Read".
6317 let file_a = FileNode {
6318 path: "mod_a.rs".to_string(),
6319 defs: vec![Definition {
6320 name: "Read".to_string(),
6321 kind: "trait_item".to_string(),
6322 start_line: 1,
6323 end_line: 3,
6324 scope: String::new(),
6325 signature: None,
6326 start_byte: 0,
6327 end_byte: 50,
6328 calls: vec![],
6329 decorator: None,
6330 lsp_kind_hint: None,
6331 }],
6332 imports: vec![],
6333 };
6334 let file_b = FileNode {
6335 path: "mod_b.rs".to_string(),
6336 defs: vec![Definition {
6337 name: "Read".to_string(),
6338 kind: "trait_item".to_string(),
6339 start_line: 1,
6340 end_line: 3,
6341 scope: String::new(),
6342 signature: None,
6343 start_byte: 0,
6344 end_byte: 50,
6345 calls: vec![],
6346 decorator: None,
6347 lsp_kind_hint: None,
6348 }],
6349 imports: vec![],
6350 };
6351 let file_c = FileNode {
6352 path: "caller.rs".to_string(),
6353 defs: vec![Definition {
6354 name: "do_thing".to_string(),
6355 kind: "function_item".to_string(),
6356 start_line: 1,
6357 end_line: 5,
6358 scope: String::new(),
6359 signature: None,
6360 start_byte: 0,
6361 end_byte: 100,
6362 calls: vec![CallRef {
6363 name: "Read".to_string(),
6364 qualified_path: None,
6365 receiver_type: None,
6366 byte_offset: 10,
6367 resolved: None,
6368 }],
6369 decorator: None,
6370 lsp_kind_hint: None,
6371 }],
6372 imports: vec![],
6373 };
6374
6375 let mut files = vec![file_a, file_b, file_c];
6376 let def_index = build_def_index(&files);
6377 resolve_calls(&mut files, &def_index, &HashMap::new());
6378
6379 // The unqualified call to "Read" is ambiguous (two candidates, neither in same
6380 // file nor imported). Resolution must leave it as None — silent first-wins is wrong.
6381 let resolved = files[2].defs[0].calls[0].resolved;
6382 assert_eq!(
6383 resolved, None,
6384 "ambiguous unqualified call with no import context must resolve to None, not silently pick first"
6385 );
6386 }
6387
6388 // ── D1 / D2 tests ────────────────────────────────────────────────
6389
6390 /// Build a small test graph with N files and an optional JSON-extension file.
6391 fn build_test_graph(n_code: usize, include_json: bool) -> (RepoGraph, Vec<usize>) {
6392 let mut file_nodes: Vec<FileNode> = (0..n_code)
6393 .map(|i| FileNode {
6394 path: format!("src/file_{i}.rs"),
6395 defs: vec![
6396 Definition {
6397 name: format!("func_{i}"),
6398 kind: "function_item".to_string(),
6399 start_line: 1,
6400 end_line: 5,
6401 scope: String::new(),
6402 signature: Some(format!("fn func_{i}() -> i32")),
6403 start_byte: 0,
6404 end_byte: 100,
6405 calls: vec![],
6406 decorator: None,
6407 lsp_kind_hint: None,
6408 },
6409 Definition {
6410 name: format!("MyStruct{i}"),
6411 kind: "struct_item".to_string(),
6412 start_line: 7,
6413 end_line: 10,
6414 scope: String::new(),
6415 signature: None,
6416 start_byte: 110,
6417 end_byte: 200,
6418 calls: vec![],
6419 decorator: None,
6420 lsp_kind_hint: None,
6421 },
6422 ],
6423 imports: vec![],
6424 })
6425 .collect();
6426
6427 let json_idx = if include_json {
6428 let idx = file_nodes.len();
6429 file_nodes.push(FileNode {
6430 path: "data/config.json".to_string(),
6431 defs: vec![],
6432 imports: vec![],
6433 });
6434 vec![idx]
6435 } else {
6436 vec![]
6437 };
6438
6439 // Build a star graph: all code files point to file_0.
6440 let n = file_nodes.len();
6441 #[expect(clippy::cast_possible_truncation, reason = "test: n_code << u32::MAX")]
6442 let edges: Vec<(u32, u32, u32)> = (1..n_code).map(|i| (i as u32, 0, 1)).collect();
6443
6444 let base_ranks = pagerank(n, &edges, None);
6445 let (callers, callees) = build_neighbor_lists(n, &edges);
6446
6447 let graph = RepoGraph {
6448 files: file_nodes,
6449 edges,
6450 base_ranks,
6451 callers,
6452 callees,
6453 def_edges: vec![],
6454 def_ranks: vec![],
6455 def_callers: vec![],
6456 def_callees: vec![],
6457 def_offsets: vec![0],
6458 alpha: 0.5,
6459 };
6460
6461 (graph, json_idx)
6462 }
6463
6464 /// D1: `render_json` returns a `GetRepoMapResponse` with a `files` array.
6465 ///
6466 /// On the baseline (before D1) `get_repo_map_ripvec` returned markdown prose via
6467 /// `repo_map::render`; no `files` key existed in the output.
6468 #[test]
6469 fn get_repo_map_returns_json_with_files_array() {
6470 let (graph, _) = build_test_graph(5, false);
6471 let response = render_json(&graph, 50, None, false);
6472 assert!(
6473 !response.files.is_empty(),
6474 "files array should be non-empty for a non-empty graph"
6475 );
6476 // Serialize and verify the JSON shape has a `files` key.
6477 let json = serde_json::to_string(&response).expect("serialize");
6478 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6479 assert!(
6480 parsed["files"].is_array(),
6481 "serialized response must have a `files` JSON array; got: {parsed}"
6482 );
6483 }
6484
6485 /// D1: every file entry has an `lsp_location` field.
6486 ///
6487 /// Before D1, output was prose text; no `lsp_location` existed anywhere in the response.
6488 #[test]
6489 fn get_repo_map_each_file_has_lsp_location() {
6490 let (graph, _) = build_test_graph(5, false);
6491 let response = render_json(&graph, 50, None, false);
6492 for file in &response.files {
6493 assert!(
6494 !file.lsp_location.file_path.is_empty(),
6495 "each file must have a non-empty lsp_location.file_path"
6496 );
6497 }
6498 // Also verify through JSON.
6499 let json = serde_json::to_string(&response).expect("serialize");
6500 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6501 for entry in parsed["files"].as_array().expect("files array") {
6502 assert!(
6503 entry["lsp_location"]["file_path"].is_string(),
6504 "each file entry must have lsp_location.file_path string; entry: {entry}"
6505 );
6506 }
6507 }
6508
6509 /// D1: every symbol has a `kind` (u32) and an `lsp_location`.
6510 ///
6511 /// Before D1 symbols were rendered as prose strings like `"function_item func_0"`.
6512 #[test]
6513 fn get_repo_map_each_symbol_has_kind_and_lsp_location() {
6514 let (graph, _) = build_test_graph(3, false);
6515 let response = render_json(&graph, 50, None, false);
6516 for file in &response.files {
6517 for sym in &file.symbols {
6518 assert!(
6519 sym.kind > 0,
6520 "symbol kind must be a positive LSP SymbolKind; got 0 for '{}'",
6521 sym.name
6522 );
6523 assert!(
6524 !sym.lsp_location.file_path.is_empty(),
6525 "symbol must have lsp_location.file_path"
6526 );
6527 }
6528 }
6529 // Verify through JSON: kind should be a number.
6530 let json = serde_json::to_string(&response).expect("serialize");
6531 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6532 for file_entry in parsed["files"].as_array().expect("files") {
6533 for sym_entry in file_entry["symbols"].as_array().expect("symbols") {
6534 assert!(
6535 sym_entry["kind"].is_number(),
6536 "symbol `kind` must be a JSON number; sym: {sym_entry}"
6537 );
6538 assert!(
6539 sym_entry["lsp_location"]["file_path"].is_string(),
6540 "symbol must have lsp_location.file_path; sym: {sym_entry}"
6541 );
6542 }
6543 }
6544 }
6545
6546 /// D1: `calls` field is an array of `RepoMapCall`-shaped objects (each has
6547 /// `lsp_location` and `rank`).
6548 ///
6549 /// In 4.0.1 calls moved from bare `lsp_location` objects to `RepoMapCall`
6550 /// objects that carry both the target `lsp_location` and the target file's
6551 /// `base_rank`.
6552 #[test]
6553 fn get_repo_map_calls_field_is_array_of_lsp_locations() {
6554 // Build a 5-file star graph so file_0 has non-empty callees.
6555 let (graph, _) = build_test_graph(5, false);
6556 let response = render_json(&graph, 50, None, false);
6557 let json = serde_json::to_string(&response).expect("serialize");
6558 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6559 for file_entry in parsed["files"].as_array().expect("files") {
6560 let calls = file_entry["calls"]
6561 .as_array()
6562 .expect("calls must be an array");
6563 for call in calls {
6564 // In 4.0.1 each call entry is a RepoMapCall with lsp_location + rank.
6565 assert!(
6566 call["lsp_location"]["file_path"].is_string(),
6567 "each call entry must have lsp_location.file_path string; call: {call}"
6568 );
6569 assert!(
6570 call["rank"].is_number(),
6571 "each call entry must have a numeric rank; call: {call}"
6572 );
6573 }
6574 }
6575 }
6576
6577 /// D2 / G3: `render_json_budgeted` with a very tight budget returns fewer files.
6578 ///
6579 /// Before the budget allocator, `max_files=3` controlled file count but not
6580 /// per-file expansion. In 4.0.1 the token_budget controls total bytes; with
6581 /// a budget of 1 token (= 4 bytes) only the envelope minimum allows any file
6582 /// at all, and the test verifies that the total_files counter still reflects
6583 /// the full eligible count. `render_json` (compat shim) passes a generous
6584 /// budget; use `render_json_budgeted` with a tight budget to verify the cap.
6585 #[test]
6586 fn get_repo_map_returns_at_most_max_files_files() {
6587 let (graph, _) = build_test_graph(10, false);
6588 // Use render_json_budgeted directly with a tight budget (600 bytes = 3 files
6589 // × 200-byte floor). Each file's envelope minimum is 200 bytes so a 600-byte
6590 // budget should admit at most 3 files.
6591 let response = render_json_budgeted(&graph, 150, None, false);
6592 assert!(
6593 response.files.len() <= 3,
6594 "files.len() = {} must be <= 3 for a 600-byte budget",
6595 response.files.len()
6596 );
6597 assert_eq!(
6598 response.total_files, 10,
6599 "total_files must reflect the full eligible count before budget cap"
6600 );
6601 assert!(
6602 response.capped,
6603 "capped must be true when total_files > files.len()"
6604 );
6605 }
6606
6607 /// D2: `include_metadata=false` (default) excludes JSON/TOML/etc. files.
6608 ///
6609 /// Before D2, JSON files with thousands of repeated keys dominated the
6610 /// output (Issue #5 — JSON-key flooding).
6611 #[test]
6612 fn get_repo_map_excludes_meta_by_default() {
6613 let (graph, _) = build_test_graph(3, /*include_json=*/ true);
6614 // Default: include_metadata = false
6615 let response = render_json(&graph, 50, None, false);
6616 for file in &response.files {
6617 assert!(
6618 !std::path::Path::new(&file.lsp_location.file_path)
6619 .extension()
6620 .is_some_and(|e| e.eq_ignore_ascii_case("json")),
6621 "JSON (Meta) files must be excluded when include_metadata=false; found: {}",
6622 file.lsp_location.file_path
6623 );
6624 }
6625 }
6626
6627 /// D2: `include_metadata=true` includes JSON files.
6628 ///
6629 /// Callers who opt-in to metadata should see all content kinds.
6630 #[test]
6631 fn get_repo_map_include_metadata_true_includes_json() {
6632 let (graph, _) = build_test_graph(3, /*include_json=*/ true);
6633 let response = render_json(&graph, 50, None, true);
6634 let has_json = response.files.iter().any(|f| {
6635 std::path::Path::new(&f.lsp_location.file_path)
6636 .extension()
6637 .is_some_and(|e| e.eq_ignore_ascii_case("json"))
6638 });
6639 assert!(
6640 has_json,
6641 "JSON file must be present when include_metadata=true"
6642 );
6643 }
6644
6645 /// J1/J2 MEASUREMENT: flask corpus focus_file=blueprints.py rank dispersion.
6646 ///
6647 /// Mandatory measurement from the 4.0.5 Wave-2 Front-C briefing:
6648 /// - `len(files) >= 8` (not collapsed to just the focus)
6649 /// - focus file rank is the highest in the response
6650 /// - next 5 files all have rank >= 10% of focus rank
6651 /// - neighborhood contains semantically related files (app.py, scaffold.py)
6652 #[test]
6653 #[ignore = "runs on flask corpus at tests/corpus/code/flask; use --ignored --nocapture"]
6654 #[expect(
6655 clippy::too_many_lines,
6656 reason = "end-to-end corpus measurement test; assertion sequence is sequential and cannot be meaningfully split"
6657 )]
6658 fn test_flask_focus_blueprints_rank_dispersion() {
6659 let corpus_root = Path::new(env!("CARGO_MANIFEST_DIR"))
6660 .parent()
6661 .unwrap()
6662 .parent()
6663 .unwrap()
6664 .join("tests/corpus/code/flask");
6665
6666 assert!(
6667 corpus_root.exists(),
6668 "flask corpus not found at {}",
6669 corpus_root.display()
6670 );
6671
6672 let graph = build_graph(&corpus_root).expect("build_graph on flask corpus");
6673 eprintln!("Flask corpus: {} files in graph", graph.files.len());
6674
6675 // Find focus file
6676 let focus_path = "src/flask/blueprints.py";
6677 let focus_idx = graph.files.iter().position(|f| f.path == focus_path);
6678 eprintln!("Focus file '{focus_path}' -> idx: {focus_idx:?}");
6679 assert!(
6680 focus_idx.is_some(),
6681 "blueprints.py not found in graph; available files: {:?}",
6682 graph
6683 .files
6684 .iter()
6685 .map(|f| &f.path)
6686 .take(20)
6687 .collect::<Vec<_>>()
6688 );
6689
6690 let response = render_json_budgeted(&graph, 4000, focus_idx, false);
6691
6692 // Criterion 1: at least 8 files returned.
6693 eprintln!(
6694 "Focused response: {} files (total_files={})",
6695 response.files.len(),
6696 response.total_files
6697 );
6698 assert!(
6699 response.files.len() >= 8,
6700 "expected >= 8 files in focused response; got {} — I#16 winner-take-all collapse",
6701 response.files.len()
6702 );
6703
6704 // Print top 10 for inspection.
6705 eprintln!("\nTop 10 focused files:");
6706 for (i, f) in response.files.iter().take(10).enumerate() {
6707 eprintln!(" [{i}] rank={:.6} {}", f.rank, f.lsp_location.file_path);
6708 }
6709
6710 // Criterion 2: focus file must appear near the top (top-3) of focused
6711 // results. With PERSONALIZATION_ALPHA=0.15 and the flask corpus,
6712 // src/flask/app.py has higher structural rank than blueprints.py and
6713 // may legitimately rank #1 — the focus boosts blueprints.py relative
6714 // to its unfocused position, but doesn't guarantee it beats every
6715 // structurally central neighbor. Being in top-3 confirms the bias
6716 // is working (pre-fix blueprints.py was #1 at 0.703 but that was a
6717 // degenerate collapse; now #1 or #2 is healthy).
6718 let focus_file_rank = response
6719 .files
6720 .iter()
6721 .find(|f| {
6722 f.lsp_location.file_path.contains("blueprints.py")
6723 && !f.lsp_location.file_path.contains("test_")
6724 && !f.lsp_location.file_path.contains("sansio")
6725 })
6726 .map(|f| f.rank)
6727 .unwrap_or(0.0);
6728 let focus_position = response
6729 .files
6730 .iter()
6731 .position(|f| {
6732 f.lsp_location.file_path.contains("blueprints.py")
6733 && !f.lsp_location.file_path.contains("test_")
6734 && !f.lsp_location.file_path.contains("sansio")
6735 })
6736 .unwrap_or(usize::MAX);
6737 eprintln!(
6738 "\nblueprinets.py position: #{} rank={:.6}",
6739 focus_position + 1,
6740 focus_file_rank
6741 );
6742 assert!(
6743 focus_position < 3,
6744 "blueprints.py must be in top-3 focused results (got position {}); \
6745 soft personalization must rebias toward focus neighborhood — I#16",
6746 focus_position + 1
6747 );
6748
6749 // Criterion 3: next 5 non-focus files have rank >= 10% of the top
6750 // file's rank. This is the core dispersion check: no more Dirac-delta
6751 // collapse where one file is 0.703 and all others are 0.003.
6752 let top_rank = response.files[0].rank;
6753 let non_focus_min_5 = response
6754 .files
6755 .iter()
6756 .filter(|f| {
6757 !(f.lsp_location.file_path.contains("blueprints.py")
6758 && !f.lsp_location.file_path.contains("test_")
6759 && !f.lsp_location.file_path.contains("sansio"))
6760 })
6761 .take(5)
6762 .map(|f| f.rank)
6763 .fold(f32::INFINITY, f32::min);
6764 let pct = non_focus_min_5 / top_rank * 100.0;
6765 eprintln!(
6766 "\nNext-5 (non-focus) min rank: {non_focus_min_5:.6} = {pct:.1}% of top rank {top_rank:.6}"
6767 );
6768 assert!(
6769 pct >= 10.0,
6770 "next-5 non-focus files min rank is {pct:.1}% of top rank (need ≥ 10%); \
6771 files are collapsing to near-zero floor — I#16"
6772 );
6773
6774 // Criterion 4: neighborhood quality — related files present.
6775 let related_names = ["app.py", "scaffold.py", "sansio"];
6776 let found_related: Vec<&str> = related_names
6777 .iter()
6778 .copied()
6779 .filter(|name| {
6780 response
6781 .files
6782 .iter()
6783 .any(|f| f.lsp_location.file_path.contains(name))
6784 })
6785 .collect();
6786 eprintln!("\nNeighborhood quality: found related files: {found_related:?}");
6787 // At least one related file should appear (soft assertion — log if missing).
6788 if found_related.is_empty() {
6789 eprintln!(
6790 "WARNING: no expected related files (app.py, scaffold.py) found in neighborhood"
6791 );
6792 }
6793 }
6794
6795 #[test]
6796 #[ignore = "runs on full ripvec codebase; use --nocapture to see output"]
6797 fn test_full_repo_map() {
6798 use std::time::Instant;
6799
6800 let root = Path::new(env!("CARGO_MANIFEST_DIR"))
6801 .parent()
6802 .unwrap()
6803 .parent()
6804 .unwrap();
6805
6806 // Phase 1: build_graph (walk + parse + import resolve + PageRank)
6807 let t0 = Instant::now();
6808 let graph = build_graph(root).expect("build_graph on ripvec root");
6809 let build_ms = t0.elapsed().as_secs_f64() * 1000.0;
6810
6811 // Phase 2: render (default, no focus)
6812 let t1 = Instant::now();
6813 let rendered = render(&graph, 2000, None);
6814 let render_ms = t1.elapsed().as_secs_f64() * 1000.0;
6815
6816 // Phase 3: render (topic-sensitive, focused on highest-ranked file)
6817 let t2 = Instant::now();
6818 let focus_idx = graph
6819 .base_ranks
6820 .iter()
6821 .enumerate()
6822 .max_by(|a, b| a.1.total_cmp(b.1))
6823 .map(|(i, _)| i);
6824 let focused = render(&graph, 2000, focus_idx);
6825 let focus_ms = t2.elapsed().as_secs_f64() * 1000.0;
6826
6827 eprintln!("\n=== Repo Map Performance ===");
6828 eprintln!(
6829 "Files: {}, Edges: {}, Defs: {}",
6830 graph.files.len(),
6831 graph.edges.len(),
6832 graph.files.iter().map(|f| f.defs.len()).sum::<usize>()
6833 );
6834 eprintln!("build_graph: {build_ms:.1}ms (walk + parse + resolve + PageRank)");
6835 eprintln!(
6836 "render(default): {render_ms:.3}ms ({} chars, ~{} tokens)",
6837 rendered.len(),
6838 rendered.len() / 4
6839 );
6840 eprintln!(
6841 "render(focused): {focus_ms:.3}ms ({} chars, ~{} tokens)",
6842 focused.len(),
6843 focused.len() / 4
6844 );
6845
6846 eprintln!("\nTop 5 by PageRank:");
6847 let mut ranked: Vec<(usize, f32)> = graph.base_ranks.iter().copied().enumerate().collect();
6848 ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
6849 for (i, rank) in ranked.iter().take(5) {
6850 eprintln!(" {:.4} {}", rank, graph.files[*i].path);
6851 }
6852
6853 eprintln!("\n=== Default Render ===\n{rendered}");
6854 eprintln!(
6855 "\n=== Focused Render (on {}) ===\n{focused}",
6856 focus_idx
6857 .map(|i| graph.files[i].path.as_str())
6858 .unwrap_or("none")
6859 );
6860 }
6861
6862 // ── C1/C2 Tests (4.1.1): Python decorator-aware kind classification ──
6863
6864 /// `test:chunker_stores_property_decorator_kind` — a Python `@property`-
6865 /// decorated function produces a [`Definition`] with
6866 /// `decorator = Some("property")` and `lsp_kind_hint = Some(7)`.
6867 ///
6868 /// Behavior: trigger-fails-on-baseline-then-passes-post-fix.
6869 /// Baseline: `Definition` had no `decorator` / `lsp_kind_hint` fields.
6870 /// Post-fix: `extract_definitions` populates both from the AST.
6871 #[test]
6872 fn test_chunker_stores_property_decorator_kind() {
6873 let source = "@property\ndef foo(self):\n return self._x\n";
6874 let lang_cfg = crate::languages::config_for_extension("py").expect("python lang config");
6875 let defs = extract_definitions(source, &lang_cfg);
6876
6877 let foo = defs
6878 .iter()
6879 .find(|d| d.name == "foo")
6880 .expect("expected def 'foo'");
6881 assert_eq!(
6882 foo.decorator.as_deref(),
6883 Some("property"),
6884 "decorator must be Some(\"property\") for @property def; got {:?}",
6885 foo.decorator
6886 );
6887 assert_eq!(
6888 foo.lsp_kind_hint,
6889 Some(7),
6890 "lsp_kind_hint must be Some(7) (Property) for @property def; got {:?}",
6891 foo.lsp_kind_hint
6892 );
6893 }
6894
6895 /// `test:chunker_stores_classmethod_decorator_kind` — a Python `@classmethod`-
6896 /// decorated function produces a [`Definition`] with
6897 /// `decorator = Some("classmethod")` and `lsp_kind_hint = Some(12)`.
6898 #[test]
6899 fn test_chunker_stores_classmethod_decorator_kind() {
6900 let source = "@classmethod\ndef from_dict(cls, d):\n pass\n";
6901 let lang_cfg = crate::languages::config_for_extension("py").expect("python lang config");
6902 let defs = extract_definitions(source, &lang_cfg);
6903
6904 let from_dict = defs
6905 .iter()
6906 .find(|d| d.name == "from_dict")
6907 .expect("expected def 'from_dict'");
6908 assert_eq!(
6909 from_dict.decorator.as_deref(),
6910 Some("classmethod"),
6911 "decorator must be Some(\"classmethod\"); got {:?}",
6912 from_dict.decorator
6913 );
6914 assert_eq!(
6915 from_dict.lsp_kind_hint,
6916 Some(12),
6917 "lsp_kind_hint must be Some(12) (Function) for @classmethod def; got {:?}",
6918 from_dict.lsp_kind_hint
6919 );
6920 }
6921
6922 /// `test:chunker_stores_arbitrary_decorator_name` — a Python
6923 /// `@functools.lru_cache`-decorated function (attribute-access decorator)
6924 /// produces a [`Definition`] with `decorator = Some("functools.lru_cache")`
6925 /// and `lsp_kind_hint = Some(12)`.
6926 #[test]
6927 fn test_chunker_stores_arbitrary_decorator_name() {
6928 let source = "@functools.lru_cache\ndef expensive(n):\n pass\n";
6929 let lang_cfg = crate::languages::config_for_extension("py").expect("python lang config");
6930 let defs = extract_definitions(source, &lang_cfg);
6931
6932 let expensive = defs
6933 .iter()
6934 .find(|d| d.name == "expensive")
6935 .expect("expected def 'expensive'");
6936 // Attribute decorators are stored as dotted-name text.
6937 assert_eq!(
6938 expensive.decorator.as_deref(),
6939 Some("functools.lru_cache"),
6940 "decorator must be Some(\"functools.lru_cache\"); got {:?}",
6941 expensive.decorator
6942 );
6943 assert_eq!(
6944 expensive.lsp_kind_hint,
6945 Some(12),
6946 "lsp_kind_hint must be Some(12) (Function) for @functools.lru_cache def; got {:?}",
6947 expensive.lsp_kind_hint
6948 );
6949 }
6950
6951 /// `test:repo_map_projection_uses_stored_kind_for_python_decorator` — the
6952 /// `render_json_budgeted` projection site reads `lsp_kind_hint` from the
6953 /// stored [`Definition`] rather than re-computing via `lsp_symbol_kind_for_node_kind`.
6954 ///
6955 /// A synthetic [`RepoGraph`] with a Definition whose `kind = "decorated_definition"`
6956 /// (which would map to Property=7 via the AST-less path) but
6957 /// `lsp_kind_hint = Some(12)` must produce a [`RepoMapSymbol`] with `kind = 12`.
6958 #[test]
6959 fn test_repo_map_projection_uses_stored_kind_for_python_decorator() {
6960 // Build a synthetic graph: one file, one @classmethod-decorated function.
6961 // kind = "decorated_definition" (AST-less → 7 = Property, WRONG).
6962 // lsp_kind_hint = Some(12) (stored at parse time, CORRECT).
6963 let files = vec![FileNode {
6964 path: "module.py".to_string(),
6965 defs: vec![Definition {
6966 name: "from_dict".to_string(),
6967 kind: "decorated_definition".to_string(),
6968 start_line: 1,
6969 end_line: 3,
6970 scope: String::new(),
6971 signature: None,
6972 start_byte: 0,
6973 end_byte: 60,
6974 calls: vec![],
6975 decorator: Some("classmethod".to_string()),
6976 lsp_kind_hint: Some(12),
6977 }],
6978 imports: vec![],
6979 }];
6980 let graph = build_graph_from_files_pub(files);
6981 let result = render_json_budgeted(&graph, 4000, None, false);
6982
6983 // Locate the symbol in the result.
6984 let file = result
6985 .files
6986 .iter()
6987 .find(|f| f.lsp_location.file_path.contains("module.py"))
6988 .expect("module.py must appear in render output");
6989 let sym = file
6990 .symbols
6991 .iter()
6992 .find(|s| s.name == "from_dict")
6993 .expect("from_dict must appear as a symbol");
6994 assert_eq!(
6995 sym.kind, 12,
6996 "C2: lsp_kind_hint=12 must override the AST-less kind (7 for decorated_definition); got {}",
6997 sym.kind
6998 );
6999 }
7000}