ripvec_core/repo_map.rs
1//! `PageRank`-weighted structural overview of a codebase.
2//!
3//! Builds a dependency graph from tree-sitter definition and import extraction,
4//! ranks files by importance using `PageRank` (standard or topic-sensitive), and
5//! renders a budget-constrained overview with tiered detail levels.
6
7use std::collections::{HashMap, HashSet};
8use std::fmt::Write as _;
9use std::path::{Path, PathBuf};
10
11use rayon::prelude::*;
12use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
13use streaming_iterator::StreamingIterator;
14use tree_sitter::{Parser, Query, QueryCursor};
15
16use serde::Serialize;
17
18use crate::chunk::ContentKind;
19use crate::languages;
20use crate::walk;
21
22/// Serialize a `ContentKind` to a lowercase string tag for JSON output.
23fn content_kind_tag(ck: ContentKind) -> &'static str {
24 match ck {
25 ContentKind::Code => "code",
26 ContentKind::Docs => "docs",
27 ContentKind::Meta => "meta",
28 }
29}
30
31// ── Data Structures ──────────────────────────────────────────────────
32
33/// Persisted dependency graph with `PageRank` scores.
34#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
35pub struct RepoGraph {
36 /// Files in the repository with definitions, imports, and calls.
37 pub files: Vec<FileNode>,
38 /// File-level edges (derived from def-level call edges).
39 pub edges: Vec<(u32, u32, u32)>,
40 /// File-level `PageRank` scores (aggregated from def-level).
41 pub base_ranks: Vec<f32>,
42 /// File-level callers (indices into `files`).
43 pub callers: Vec<Vec<u32>>,
44 /// File-level callees (indices into `files`).
45 pub callees: Vec<Vec<u32>>,
46 /// Definition-level call edges: `(caller_def, callee_def, weight)`.
47 pub def_edges: Vec<(DefId, DefId, u32)>,
48 /// Definition-level `PageRank` scores (flattened: `offsets[file_idx] + def_idx`).
49 pub def_ranks: Vec<f32>,
50 /// Definition-level callers (flattened, parallel to `def_ranks`).
51 pub def_callers: Vec<Vec<DefId>>,
52 /// Definition-level callees (flattened, parallel to `def_ranks`).
53 pub def_callees: Vec<Vec<DefId>>,
54 /// Prefix-sum offsets for flattening `DefId` to linear index.
55 pub def_offsets: Vec<usize>,
56 /// Auto-tuned alpha for search boost.
57 pub alpha: f32,
58}
59
60/// A file in the repository with its definitions and imports.
61#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
62pub struct FileNode {
63 /// Relative path from the repository root.
64 pub path: String,
65 /// Definitions (functions, structs, classes, etc.) extracted from this file.
66 pub defs: Vec<Definition>,
67 /// Import references extracted from this file.
68 pub imports: Vec<ImportRef>,
69}
70
71/// A definition extracted from a source file.
72#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
73pub struct Definition {
74 /// Name of the definition (e.g., function name, class name).
75 pub name: String,
76 /// Kind of syntax node (e.g., `function_item`, `class_definition`).
77 pub kind: String,
78 /// 1-based start line number.
79 pub start_line: u32,
80 /// 1-based end line number.
81 pub end_line: u32,
82 /// Scope chain (e.g., `"impl_item Foo > fn bar"`).
83 pub scope: String,
84 /// Function/method signature, if available.
85 pub signature: Option<String>,
86 /// Byte offset of this definition's start in the source file.
87 pub start_byte: u32,
88 /// Byte offset of this definition's end in the source file.
89 pub end_byte: u32,
90 /// Call sites within this definition's body.
91 pub calls: Vec<CallRef>,
92}
93
94/// An import reference extracted from a source file.
95#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
96pub struct ImportRef {
97 /// Raw import path as written in source (e.g., `crate::foo::bar`).
98 pub raw_path: String,
99 /// Resolved file index in [`RepoGraph::files`], if resolution succeeded.
100 pub resolved_idx: Option<u32>,
101}
102
103/// Unique identifier for a definition: (file index, definition index within file).
104pub type DefId = (u32, u16);
105
106/// A call site extracted from a definition body.
107#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
108pub struct CallRef {
109 /// Callee function/method name (bare, without qualifier).
110 ///
111 /// For scoped calls like `mod_a::foo()`, this is `"foo"`.
112 /// For bare calls like `foo()`, this is `"foo"`.
113 pub name: String,
114 /// Full qualified path for scoped calls, e.g. `Some("mod_a::foo")`.
115 ///
116 /// `None` for bare (unqualified) calls. When `Some`, `resolve_calls`
117 /// uses this for qualifier-based module disambiguation before falling
118 /// back to the bare `name`.
119 pub qualified_path: Option<String>,
120 /// Receiver type for method calls, inferred from local context.
121 ///
122 /// Set to `Some("Foo")` when:
123 /// - The call is `self.method()` inside `impl Foo { … }`.
124 /// - The call is `x.method()` where `x` has an explicit type annotation `x: Foo`.
125 /// - The call is `x.method()` after `let x = Foo::new()`.
126 ///
127 /// `None` for free function calls, or when the receiver type cannot be
128 /// inferred from local context alone. When `Some`, `resolve_calls` prefers
129 /// defs whose enclosing impl scope matches the receiver type.
130 pub receiver_type: Option<String>,
131 /// Byte offset of the call in the source file (for scoping to definitions).
132 pub byte_offset: u32,
133 /// Resolved target definition, if resolution succeeded.
134 pub resolved: Option<DefId>,
135}
136
137// ── JSON output types ────────────────────────────────────────────────
138
139/// LSP-shaped location pointing at a file or symbol within a file.
140///
141/// Lines and characters are 0-based, matching the Language Server Protocol
142/// convention so callers can pass this directly to LSP tools without any
143/// conversion.
144#[derive(Debug, Clone, Serialize)]
145pub struct RepoMapLspLocation {
146 /// Relative path from the repository root (prefixed with `./`).
147 pub file_path: String,
148 /// 0-based start line.
149 pub start_line: usize,
150 /// 0-based start character (0 for file-level locations).
151 pub start_character: usize,
152 /// 0-based end line (equals `start_line` for file-level locations).
153 pub end_line: usize,
154 /// 0-based end character (0 for file-level locations).
155 pub end_character: usize,
156}
157
158/// A top-level symbol extracted from a file in the repository map.
159///
160/// Analogous to an LSP `DocumentSymbol` but limited to the fields available
161/// from tree-sitter definition extraction. The `rank` field carries the
162/// definition-level `PageRank` score from [`RepoGraph::def_ranks`], enabling
163/// callers to prioritise symbols by structural importance.
164#[derive(Debug, Clone, Serialize)]
165pub struct RepoMapSymbol {
166 /// Symbol name (function name, struct name, etc.).
167 pub name: String,
168 /// LSP `SymbolKind` as a decimal — use the same values as
169 /// `lsp_workspace_symbols` and `lsp_document_symbols`.
170 pub kind: u32,
171 /// Location pointing at the symbol's definition line (0-based).
172 pub lsp_location: RepoMapLspLocation,
173 /// Definition-level `PageRank` score from [`RepoGraph::def_ranks`].
174 ///
175 /// Higher values indicate definitions that are called by many other
176 /// definitions. Used by the token-budget allocator to decide which
177 /// symbols to include when the per-file budget is constrained.
178 pub rank: f32,
179}
180
181/// An outgoing call-edge from a file to another file.
182///
183/// Carries both the target file's `lsp_location` and its `base_rank`
184/// (file-level `PageRank` score) so callers can decide how important
185/// each dependency is without a separate lookup.
186#[derive(Debug, Clone, Serialize)]
187pub struct RepoMapCall {
188 /// Location pointing at the target file (line 0, character 0).
189 pub lsp_location: RepoMapLspLocation,
190 /// File-level `PageRank` score of the target file.
191 pub rank: f32,
192}
193
194/// One file entry in the JSON repo map.
195///
196/// Carries the file's `PageRank` score, content kind, outgoing call-edges to
197/// other files, and the file's top-level symbol definitions — all with
198/// `lsp_location` so the caller can chain directly into LSP tools without
199/// any destructuring.
200#[derive(Debug, Clone, Serialize)]
201pub struct RepoMapFile {
202 /// Location pointing at the file itself (line 0, character 0).
203 ///
204 /// Pass `lsp_location.file_path` directly into `lsp_document_symbols` or
205 /// any other file-scoped tool.
206 pub lsp_location: RepoMapLspLocation,
207 /// `PageRank` score in [0, 1] (higher = more structurally central).
208 pub rank: f32,
209 /// Content classification: `"code"`, `"docs"`, or `"meta"`.
210 ///
211 /// Serialized as a lowercase string tag so JSON consumers can branch
212 /// without numeric magic values. Mirrors the `ContentKind` enum in
213 /// `ripvec-core::chunk`.
214 pub content_kind: &'static str,
215 /// Outgoing call-edges sorted by target file `PageRank` descending.
216 pub calls: Vec<RepoMapCall>,
217 /// Top-level definitions extracted from this file by tree-sitter,
218 /// sorted by definition-level `PageRank` descending and pruned to
219 /// the per-file token-budget allocation.
220 pub symbols: Vec<RepoMapSymbol>,
221 /// Number of symbols that were omitted due to budget exhaustion or
222 /// logarithmic attenuation cutoff. `truncated_symbols + symbols.len()`
223 /// equals the total definition count for the file.
224 pub truncated_symbols: usize,
225 /// Number of call-edges that were omitted due to budget exhaustion
226 /// or logarithmic attenuation cutoff. `truncated_calls + calls.len()`
227 /// equals the total callee count for the file.
228 pub truncated_calls: usize,
229}
230
231/// JSON-mode response envelope for `get_repo_map` (4.0.1 shape).
232///
233/// Replaces the `max_files`-capped shape from 4.0.0. The caller supplies a
234/// `token_budget`; files are allocated bytes proportional to their `PageRank`
235/// (40% cap per file, 200-byte envelope floor). Symbols are filled in
236/// def-rank order with a logarithmic attenuation cutoff. Leftover bytes
237/// cascade to subsequent files.
238///
239/// The `estimated_bytes`, `budget_bytes`, and `budget_exhausted` fields give
240/// callers real-time feedback on how tightly the budget was consumed.
241#[derive(Debug, Clone, Serialize)]
242pub struct GetRepoMapResponse {
243 /// Files sorted by `PageRank` descending, pruned to the token budget.
244 pub files: Vec<RepoMapFile>,
245 /// Total number of eligible files in the graph (pre-allocation).
246 ///
247 /// If `total_files > files.len()`, the budget ran out before all files
248 /// could be included. Read `budget_exhausted` directly for the boolean.
249 pub total_files: usize,
250 /// Actual serialised-JSON byte count for all returned content.
251 pub estimated_bytes: usize,
252 /// Budget ceiling in bytes that was used for allocation
253 /// (`token_budget * 4`).
254 pub budget_bytes: usize,
255 /// `true` when `total_files > files.len()` (budget was exhausted before
256 /// all eligible files were included).
257 pub budget_exhausted: bool,
258 /// Retained for backward compatibility with 4.0.0 callers that checked
259 /// `capped`. Equivalent to `budget_exhausted`.
260 pub capped: bool,
261}
262
263// ── Constants ────────────────────────────────────────────────────────
264
265/// `PageRank` damping factor.
266const DAMPING: f32 = 0.85;
267
268/// `PageRank` convergence threshold.
269const EPSILON: f32 = 1e-6;
270
271/// Maximum `PageRank` iterations.
272const MAX_ITERATIONS: usize = 100;
273
274/// Maximum callers/callees stored per file.
275const MAX_NEIGHBORS: usize = 5;
276
277/// Approximate characters per token for budget estimation.
278const CHARS_PER_TOKEN: usize = 4;
279
280/// Concentration mass placed on the focus node in topic-sensitive `PageRank`.
281///
282/// Following Haveliwala 2002 ("Topic-Sensitive PageRank"), the personalization
283/// vector places a small bias `α` on the focus node and distributes the
284/// remaining `1 - α` uniformly over all other nodes. This preserves rank
285/// dispersion across the corpus — the user sees a *neighborhood* of related
286/// files rebiased toward the focus, not a Dirac delta on the focus node with
287/// every other file collapsed to an equally negligible uniform floor.
288///
289/// Value 0.15 means:
290/// - focus node teleportation probability = 0.15
291/// - each of the (n - 1) other nodes = 0.85 / (n - 1)
292///
293/// Prior to 4.0.5, this constant was effectively 0.70 (70% bias on focus),
294/// which caused winner-take-all collapse observed on the flask and ripvec
295/// corpora (I#16).
296const PERSONALIZATION_ALPHA: f32 = 0.15;
297
298// ── Import Queries ───────────────────────────────────────────────────
299
300/// Compile a tree-sitter import query for the given extension.
301///
302/// Returns `None` for unsupported extensions.
303fn import_query_for_extension(ext: &str) -> Option<(tree_sitter::Language, Query)> {
304 let (lang, query_str): (tree_sitter::Language, &str) = match ext {
305 "rs" => (
306 tree_sitter_rust::LANGUAGE.into(),
307 "(use_declaration) @import",
308 ),
309 "py" | "pyi" => (
310 tree_sitter_python::LANGUAGE.into(),
311 concat!(
312 "(import_statement) @import\n",
313 "(import_from_statement) @import",
314 ),
315 ),
316 "js" | "jsx" => (
317 tree_sitter_javascript::LANGUAGE.into(),
318 "(import_statement source: (string) @import_path) @import",
319 ),
320 "ts" => (
321 tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
322 "(import_statement source: (string) @import_path) @import",
323 ),
324 "tsx" => (
325 tree_sitter_typescript::LANGUAGE_TSX.into(),
326 "(import_statement source: (string) @import_path) @import",
327 ),
328 "go" => (
329 tree_sitter_go::LANGUAGE.into(),
330 "(import_spec path: (interpreted_string_literal) @import_path) @import",
331 ),
332 // Ruby: require statements.
333 "rb" => (
334 tree_sitter_ruby::LANGUAGE.into(),
335 "(call method: (identifier) @_method arguments: (argument_list (string (string_content) @import_path)) (#eq? @_method \"require\")) @import",
336 ),
337 _ => return None,
338 };
339 let query = match Query::new(&lang, query_str) {
340 Ok(q) => q,
341 Err(e) => {
342 tracing::warn!(ext, %e, "import query compilation failed — language may be ABI-incompatible");
343 return None;
344 }
345 };
346 Some((lang, query))
347}
348
349/// Extract import paths from source using tree-sitter.
350fn extract_imports(
351 source: &str,
352 lang: &tree_sitter::Language,
353 import_query: &Query,
354) -> Vec<String> {
355 let mut parser = Parser::new();
356 if parser.set_language(lang).is_err() {
357 return vec![];
358 }
359 let Some(tree) = parser.parse(source, None) else {
360 return vec![];
361 };
362
363 let mut cursor = QueryCursor::new();
364 let mut imports = Vec::new();
365 let mut matches = cursor.matches(import_query, tree.root_node(), source.as_bytes());
366
367 while let Some(m) = matches.next() {
368 // Prefer @import_path capture (JS/TS/Go), fall back to full @import text
369 let mut import_path_text = None;
370 let mut import_text = None;
371
372 for cap in m.captures {
373 let cap_name = &import_query.capture_names()[cap.index as usize];
374 let text = &source[cap.node.start_byte()..cap.node.end_byte()];
375 if *cap_name == "import_path" {
376 import_path_text = Some(text.trim_matches(|c| c == '"' || c == '\''));
377 } else if *cap_name == "import" {
378 import_text = Some(text);
379 }
380 }
381
382 if let Some(path) = import_path_text {
383 imports.push(path.to_string());
384 } else if let Some(text) = import_text {
385 imports.push(text.to_string());
386 }
387 }
388
389 imports
390}
391
392// ── Import Resolution ────────────────────────────────────────────────
393
394/// Resolve a Rust `use` path to a file index in the file map.
395///
396/// Handles `crate::`, `self::`, and `super::` prefixes. External crate
397/// imports are dropped (returns `None`).
398fn resolve_rust_import(
399 raw: &str,
400 file_path: &Path,
401 root: &Path,
402 file_index: &HashMap<PathBuf, usize>,
403) -> Option<usize> {
404 // Extract the module path from `use crate::foo::bar;` or `use crate::foo::bar::Baz;`
405 let trimmed = raw
406 .trim()
407 .trim_start_matches("use ")
408 .trim_end_matches(';')
409 .trim();
410
411 let segments: Vec<&str> = trimmed.split("::").collect();
412 if segments.is_empty() {
413 return None;
414 }
415
416 // Determine the base directory and skip prefix segments
417 let (base, skip) = match segments[0] {
418 "crate" => {
419 // Find the nearest Cargo.toml ancestor to determine the crate root.
420 // In a workspace, `crate::foo` resolves relative to the crate's src/,
421 // not the workspace root.
422 let mut dir = file_path.parent();
423 let crate_root = loop {
424 match dir {
425 Some(d) if d.join("Cargo.toml").exists() => break d.join("src"),
426 Some(d) => dir = d.parent(),
427 None => break root.join("src"), // fallback
428 }
429 };
430 (crate_root, 1)
431 }
432 "self" => {
433 let dir = file_path.parent()?;
434 (dir.to_path_buf(), 1)
435 }
436 "super" => {
437 let dir = file_path.parent()?.parent()?;
438 (dir.to_path_buf(), 1)
439 }
440 // External crate — drop
441 _ => return None,
442 };
443
444 // Build candidate paths from the remaining segments.
445 // Try progressively shorter prefixes since the last segments
446 // may be items (struct, fn) rather than modules.
447 let path_segments = &segments[skip..];
448 for end in (1..=path_segments.len()).rev() {
449 let mut candidate = base.clone();
450 for seg in &path_segments[..end] {
451 // Strip glob patterns like `{Foo, Bar}`
452 let clean = seg.split('{').next().unwrap_or(seg).trim();
453 if !clean.is_empty() {
454 candidate.push(clean);
455 }
456 }
457
458 // Try file.rs
459 let as_file = candidate.with_extension("rs");
460 if let Some(&idx) = file_index.get(&as_file) {
461 return Some(idx);
462 }
463
464 // Try dir/mod.rs
465 let as_mod = candidate.join("mod.rs");
466 if let Some(&idx) = file_index.get(&as_mod) {
467 return Some(idx);
468 }
469 }
470
471 None
472}
473
474/// Resolve an import path to a file index based on file extension.
475fn resolve_import(
476 raw: &str,
477 ext: &str,
478 file_path: &Path,
479 root: &Path,
480 file_index: &HashMap<PathBuf, usize>,
481) -> Option<usize> {
482 match ext {
483 "rs" => resolve_rust_import(raw, file_path, root, file_index),
484 "py" | "pyi" => resolve_python_import(raw, root, file_index),
485 "js" | "jsx" | "ts" | "tsx" => resolve_js_import(raw, file_path, file_index),
486 // Go imports use full package paths — skip local resolution
487 _ => None,
488 }
489}
490
491/// Resolve a Python import to a file index.
492///
493/// Handles `import foo.bar` and `from foo.bar import baz` patterns.
494fn resolve_python_import(
495 raw: &str,
496 root: &Path,
497 file_index: &HashMap<PathBuf, usize>,
498) -> Option<usize> {
499 let module_path = if let Some(rest) = raw.strip_prefix("from ") {
500 rest.split_whitespace().next()?
501 } else if let Some(rest) = raw.strip_prefix("import ") {
502 rest.split_whitespace().next()?
503 } else {
504 return None;
505 };
506
507 let rel_path: PathBuf = module_path.split('.').collect();
508 for ext in ["py", "pyi"] {
509 let as_file = root.join(&rel_path).with_extension(ext);
510 if let Some(&idx) = file_index.get(&as_file) {
511 return Some(idx);
512 }
513 }
514
515 for init_name in ["__init__.py", "__init__.pyi"] {
516 let as_init = root.join(&rel_path).join(init_name);
517 if let Some(&idx) = file_index.get(&as_init) {
518 return Some(idx);
519 }
520 }
521
522 None
523}
524
525/// Resolve a JS/TS import to a file index.
526///
527/// Handles relative paths like `./foo` or `../bar`.
528fn resolve_js_import(
529 raw: &str,
530 file_path: &Path,
531 file_index: &HashMap<PathBuf, usize>,
532) -> Option<usize> {
533 if !raw.starts_with('.') {
534 return None;
535 }
536
537 let dir = file_path.parent()?;
538 let candidate = dir.join(raw);
539
540 for ext in &["js", "jsx", "ts", "tsx"] {
541 let with_ext = candidate.with_extension(ext);
542 if let Some(&idx) = file_index.get(&with_ext) {
543 return Some(idx);
544 }
545 }
546
547 for ext in &["js", "jsx", "ts", "tsx"] {
548 let index_file = candidate.join("index").with_extension(ext);
549 if let Some(&idx) = file_index.get(&index_file) {
550 return Some(idx);
551 }
552 }
553
554 None
555}
556
557// ── Extraction ───────────────────────────────────────────────────────
558
559/// Extract definitions from a source file using tree-sitter.
560fn extract_definitions(source: &str, config: &languages::LangConfig) -> Vec<Definition> {
561 let mut parser = Parser::new();
562 if parser.set_language(&config.language).is_err() {
563 return vec![];
564 }
565 let Some(tree) = parser.parse(source, None) else {
566 return vec![];
567 };
568
569 let mut cursor = QueryCursor::new();
570 let mut defs = Vec::new();
571 let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
572
573 while let Some(m) = matches.next() {
574 let mut name = String::new();
575 let mut def_node = None;
576
577 for cap in m.captures {
578 let cap_name = &config.query.capture_names()[cap.index as usize];
579 if *cap_name == "name" {
580 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
581 } else if *cap_name == "def" {
582 def_node = Some(cap.node);
583 }
584 }
585
586 if let Some(node) = def_node {
587 let scope = crate::chunk::build_scope_chain(node, source);
588 let signature = crate::chunk::extract_signature(node, source);
589 #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
590 let start_line = node.start_position().row as u32 + 1;
591 #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
592 let end_line = node.end_position().row as u32 + 1;
593 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
594 let start_byte = node.start_byte() as u32;
595 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
596 let end_byte = node.end_byte() as u32;
597 defs.push(Definition {
598 name,
599 kind: node.kind().to_string(),
600 start_line,
601 end_line,
602 scope,
603 signature,
604 start_byte,
605 end_byte,
606 calls: vec![],
607 });
608 }
609 }
610
611 defs
612}
613
614// ── Call Extraction & Resolution ────────────────────────────────────
615
616/// Tiebreak priority for def attribution when two defs share the same byte span.
617///
618/// Returns `0` for function-like defs (lowest value = wins in `min_by_key`) and
619/// `1` for structural container defs (class bodies, impl blocks, etc.).
620///
621/// This resolves the Python case where the class body `block` and the first
622/// `function_definition` inside it occupy identical byte ranges; calls inside
623/// the function body should be attributed to the function, not the class block.
624fn is_callable_def_priority(kind: &str) -> u8 {
625 match kind {
626 // Function / method defs: these are the correct attribution targets.
627 "function_item"
628 | "function_definition"
629 | "function_declaration"
630 | "function_signature_item"
631 | "method_definition"
632 | "method_declaration"
633 | "method" => 0,
634 // Structural containers: class body blocks, impl items, etc.
635 // Prefer function-like defs over these when byte ranges tie.
636 _ => 1,
637 }
638}
639
640/// Extract call sites from a source file and assign them to definitions.
641///
642/// Uses the language's call query to find all call expressions, then
643/// assigns each call to the definition whose byte range contains it.
644/// Calls outside any definition body (module-level) are ignored.
645///
646/// For Rust scoped calls (`a::b::foo()`), the `@callee` capture returns the
647/// full `scoped_identifier` node. This function splits it into:
648/// - `name` = bare trailing identifier (`"foo"`)
649/// - `qualified_path` = `Some("a::b::foo")` for disambiguation in `resolve_calls`.
650///
651/// For method calls (`x.method()`), `receiver_type` is inferred from local
652/// context (parameter annotations, let-bindings, impl blocks). See
653/// [`infer_receiver_types`] for the heuristic.
654fn extract_calls(source: &str, call_config: &languages::CallConfig, defs: &mut [Definition]) {
655 let mut parser = Parser::new();
656 if parser.set_language(&call_config.language).is_err() {
657 return;
658 }
659 let Some(tree) = parser.parse(source, None) else {
660 return;
661 };
662
663 // Build receiver-type map: byte_offset_of_call → receiver_type_string.
664 // Done once per file to amortise the tree walk cost.
665 let receiver_map = infer_receiver_types(source, &tree, &call_config.language);
666
667 // HCL: run the HCL-specific call-edge extractor as a post-pass so the
668 // terraform_remote_state references and module blocks contribute edges
669 // that the generic function_call query cannot capture (R2 + R3, Wave 3).
670 if languages::is_hcl_language(&call_config.language) {
671 extract_hcl_call_edges(source, tree.root_node(), defs);
672 }
673
674 let mut cursor = QueryCursor::new();
675 let mut matches = cursor.matches(&call_config.query, tree.root_node(), source.as_bytes());
676
677 while let Some(m) = matches.next() {
678 let mut full_callee_text = None;
679 let mut call_byte = 0u32;
680
681 for cap in m.captures {
682 let cap_name = &call_config.query.capture_names()[cap.index as usize];
683 if *cap_name == "callee" {
684 full_callee_text =
685 Some(source[cap.node.start_byte()..cap.node.end_byte()].to_string());
686 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
687 {
688 call_byte = cap.node.start_byte() as u32;
689 }
690 }
691 }
692
693 if let Some(full_text) = full_callee_text {
694 // Split qualified path into bare name + optional qualifier.
695 let (name, qualified_path) = if full_text.contains("::") {
696 let bare = full_text
697 .rsplit("::")
698 .next()
699 .unwrap_or(&full_text)
700 .to_string();
701 (bare, Some(full_text))
702 } else {
703 (full_text, None)
704 };
705
706 // Look up receiver type from the pre-built map.
707 let receiver_type = receiver_map.get(&call_byte).cloned();
708
709 // Assign to the most-specific (smallest byte range) enclosing definition.
710 // Using `find` (first match) was incorrect for nested defs: an `impl_item`
711 // wrapping a `function_item` both contain the call site, but the
712 // `function_item` is the correct granularity for method attribution.
713 //
714 // Tiebreak: when two defs have equal byte spans (as happens in Python where
715 // the class body `block` and its first `function_definition` share the same
716 // start/end bytes), prefer function-like defs over structural container defs.
717 // `is_callable_def` returns 0 for function-like kinds (sorts first in min_by_key).
718 let enclosing_idx = defs
719 .iter()
720 .enumerate()
721 .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
722 .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
723 .map(|(i, _)| i);
724
725 if let Some(idx) = enclosing_idx {
726 // Skip self-recursive calls (compare bare name to def name).
727 if defs[idx].name != name {
728 defs[idx].calls.push(CallRef {
729 name,
730 qualified_path,
731 receiver_type,
732 byte_offset: call_byte,
733 resolved: None,
734 });
735 }
736 }
737 // Calls outside any definition are ignored (module-level init).
738 }
739 }
740}
741
742// HCL: post-pass call-edge extraction for terraform_remote_state and module
743// blocks. These are not function calls — they are HCL-specific structural
744// references to other Terraform modules — so the generic
745// `(function_call (identifier) @callee) @call` pattern in
746// `call_query_for_extension("tf")` cannot capture them. This helper runs
747// once per HCL file inside `extract_calls` (R2 + R3, Wave 3).
748
749/// Walk an HCL parse tree and emit CallRef entries for:
750///
751/// 1. `data.terraform_remote_state.<NAME>.outputs.<ATTR>` expressions:
752/// one CallRef per reference, with `name = NAME` and
753/// `qualified_path = Some("terraform_remote_state.NAME")`. These
754/// connect the current file to the named remote-state module's outputs.
755///
756/// 2. `module "X" { source = "../X" }` blocks: one CallRef with
757/// `name = X` (the label) and `qualified_path = Some("module.X")`.
758/// The module reference connects to the module's directory in
759/// `resolve_import` (HCL module-source resolution is not implemented
760/// yet — the qualified_path carrier is the contract; resolve adds the
761/// file lookup).
762///
763/// Each emitted CallRef is attached to the smallest enclosing definition
764/// by byte range — matching the same heuristic used by `extract_calls`.
765fn extract_hcl_call_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
766 // Walk all named descendants iteratively.
767 let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
768 while let Some(node) = stack.pop() {
769 // Defer to a function-style helper per node kind.
770 match node.kind() {
771 "expression" => hcl_visit_expression(source, node, defs),
772 "block" => hcl_visit_block(source, node, defs),
773 _ => {}
774 }
775 // Recurse into named children.
776 let mut cursor = node.walk();
777 for child in node.children(&mut cursor) {
778 if child.is_named() {
779 stack.push(child);
780 }
781 }
782 }
783}
784
785/// Inspect an HCL `expression` node for the
786/// `data.terraform_remote_state.<NAME>.outputs.<ATTR>` reference pattern.
787///
788/// The expression tree looks like:
789/// ```text
790/// expression
791/// variable_expr
792/// identifier "data"
793/// get_attr
794/// identifier "terraform_remote_state"
795/// get_attr
796/// identifier "<NAME>"
797/// get_attr
798/// identifier "outputs"
799/// get_attr
800/// identifier "<ATTR>"
801/// ```
802fn hcl_visit_expression(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
803 // Collect children: must be `variable_expr` (with identifier="data")
804 // followed by a chain of `get_attr` nodes (each with an `identifier` child).
805 let mut cursor = node.walk();
806 let mut child_iter = node.children(&mut cursor);
807 let Some(first) = child_iter.next() else {
808 return;
809 };
810 if first.kind() != "variable_expr" {
811 return;
812 }
813 let Some(first_id) = first.child_by_field_name("name").or_else(|| {
814 // Fallback: find first named child that's an identifier.
815 let mut c = first.walk();
816 first.children(&mut c).find(|n| n.kind() == "identifier")
817 }) else {
818 return;
819 };
820 if &source[first_id.start_byte()..first_id.end_byte()] != "data" {
821 return;
822 }
823
824 // Collect identifiers from the chain of get_attr.
825 let mut chain: Vec<String> = Vec::new();
826 for child in child_iter {
827 if child.kind() != "get_attr" {
828 return; // not a pure attribute chain
829 }
830 let mut gc = child.walk();
831 let id = child.children(&mut gc).find(|n| n.kind() == "identifier");
832 let Some(id_node) = id else { return };
833 chain.push(source[id_node.start_byte()..id_node.end_byte()].to_string());
834 }
835
836 // Expect: terraform_remote_state, <NAME>, outputs, <ATTR>
837 if chain.len() < 2 || chain[0] != "terraform_remote_state" {
838 return;
839 }
840 let name = chain[1].clone();
841 let qualified_path = format!("terraform_remote_state.{name}");
842
843 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
844 let call_byte = node.start_byte() as u32;
845 attach_hcl_call(defs, call_byte, name, Some(qualified_path));
846}
847
848/// Inspect an HCL `block` node for the `module "X" { source = "../X" }`
849/// pattern. Emits one CallRef per matching block.
850fn hcl_visit_block(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
851 // First child must be identifier="module".
852 let mut cursor = node.walk();
853 let children: Vec<tree_sitter::Node<'_>> = node.children(&mut cursor).collect();
854 let Some(first) = children.first() else {
855 return;
856 };
857 if first.kind() != "identifier" || &source[first.start_byte()..first.end_byte()] != "module" {
858 return;
859 }
860 // Next child should be a string_lit (the module label).
861 let label_node = children.iter().find(|c| c.kind() == "string_lit");
862 let Some(label_node) = label_node else {
863 return;
864 };
865 let mut lc = label_node.walk();
866 let template = label_node
867 .children(&mut lc)
868 .find(|n| n.kind() == "template_literal");
869 let Some(template) = template else {
870 return;
871 };
872 let label = source[template.start_byte()..template.end_byte()].to_string();
873 let qualified_path = format!("module.{label}");
874
875 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
876 let call_byte = node.start_byte() as u32;
877 attach_hcl_call(defs, call_byte, label, Some(qualified_path));
878}
879
880/// Attach a synthesized HCL CallRef to the smallest enclosing definition.
881/// Mirrors the byte-range attribution from `extract_calls`.
882fn attach_hcl_call(
883 defs: &mut [Definition],
884 call_byte: u32,
885 name: String,
886 qualified_path: Option<String>,
887) {
888 let enclosing_idx = defs
889 .iter()
890 .enumerate()
891 .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
892 .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
893 .map(|(i, _)| i);
894 if let Some(idx) = enclosing_idx {
895 // Skip self-recursive emission (would happen if the enclosing def
896 // happens to share the same `name` as the synthesized callee).
897 if defs[idx].name != name {
898 defs[idx].calls.push(CallRef {
899 name,
900 qualified_path,
901 receiver_type: None,
902 byte_offset: call_byte,
903 resolved: None,
904 });
905 }
906 }
907}
908
909/// Infer method-call receiver types from local context within a parse tree.
910///
911/// Returns a map from `byte_offset_of_@callee_capture` to a receiver type string.
912///
913/// Dispatches to a language-specific collector:
914///
915/// - **Rust**: [`collect_rust_receiver_types`] — three heuristic cases:
916/// 1. `self.method()` inside `impl Foo { … }` → `"Foo"`.
917/// 2. `x.method()` where `x: Bar` is a function parameter → `"Bar"`.
918/// 3. `x.method()` after `let x = Foo::new()` → `"Foo"`.
919///
920/// - **Python**: [`collect_python_receiver_types`] — two heuristic cases:
921/// 1. `self.method()` inside a class method → class name from enclosing
922/// `class_definition`.
923/// 2. `instance.method()` where `instance: ClassName` type annotation or
924/// `instance = ClassName(...)` assignment is visible in the same scope.
925///
926/// - **Go**: [`collect_go_receiver_types`] — one heuristic case:
927/// 1. `recv.Method()` inside a `method_declaration` where `recv` is the
928/// named receiver parameter → receiver type from the method signature.
929///
930/// This is heuristic, not type-inference-complete. Unknown/ambiguous cases
931/// produce no entry in the map; `extract_calls` leaves those `receiver_type = None`.
932fn infer_receiver_types(
933 source: &str,
934 tree: &tree_sitter::Tree,
935 language: &tree_sitter::Language,
936) -> HashMap<u32, String> {
937 let mut map: HashMap<u32, String> = HashMap::new();
938
939 if languages::is_rust_language(language) {
940 collect_rust_receiver_types(source, tree.root_node(), &mut map);
941 } else if languages::is_python_language(language) {
942 collect_python_receiver_types(source, tree.root_node(), &mut map);
943 } else if languages::is_go_language(language) {
944 collect_go_receiver_types(source, tree.root_node(), &mut map);
945 }
946 // Other languages: no receiver inference — leave map empty.
947
948 map
949}
950
951/// Walk the Rust parse tree and fill `map` with receiver-type inference.
952///
953/// This is a recursive descent that tracks:
954/// - The current `impl Foo` or `impl Foo for Bar` type name (for `self.*` calls).
955/// - Parameter type annotations (for `x: SomeType` → `x` has type `SomeType`).
956/// - Constructor let-bindings (`let x = Foo::new()` → `x` has type `Foo`).
957fn collect_rust_receiver_types(
958 source: &str,
959 node: tree_sitter::Node<'_>,
960 map: &mut HashMap<u32, String>,
961) {
962 // We use a stack-based walk to avoid deep recursion on large files.
963 // Each stack entry carries (node, impl_type_context).
964 let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(node, None)];
965
966 while let Some((n, impl_ctx)) = stack.pop() {
967 match n.kind() {
968 "impl_item" => {
969 // Extract `impl Foo` or `impl Trait for Foo` → capture the `for` type.
970 // tree-sitter-rust shape: `(impl_item type: (type_identifier) @type)` for
971 // inherent impls, and `(impl_item trait: … type: (type_identifier) @type)`
972 // for trait impls. Both have a child named `type`.
973 let impl_type = extract_impl_self_type(source, n);
974 let new_ctx = impl_type.or_else(|| impl_ctx.clone());
975 let mut cursor = n.walk();
976 for child in n.children(&mut cursor) {
977 stack.push((child, new_ctx.clone()));
978 }
979 }
980 "function_item" => {
981 // Build parameter bindings: (param_name → type_name).
982 let param_types = extract_param_types(source, n);
983 // Build let-binding type map from constructor calls.
984 let let_types = extract_let_binding_types(source, n);
985 // Annotate call sites within this function body.
986 annotate_method_calls(
987 source,
988 n,
989 impl_ctx.as_deref(),
990 ¶m_types,
991 &let_types,
992 map,
993 );
994 // Do NOT recurse into function_item children with the outer stack —
995 // function bodies are fully handled by annotate_method_calls.
996 // (Nested fn items would re-enter via their own impl_item context.)
997 // Push children with same impl_ctx so nested impl blocks are found.
998 let mut cursor = n.walk();
999 for child in n.children(&mut cursor) {
1000 stack.push((child, impl_ctx.clone()));
1001 }
1002 }
1003 _ => {
1004 let mut cursor = n.walk();
1005 for child in n.children(&mut cursor) {
1006 stack.push((child, impl_ctx.clone()));
1007 }
1008 }
1009 }
1010 }
1011}
1012
1013/// Extract the self type from an `impl_item` node.
1014///
1015/// For `impl Foo { … }` → `Some("Foo")`.
1016/// For `impl Trait for Foo { … }` → `Some("Foo")` (the concrete `for` type).
1017fn extract_impl_self_type(source: &str, impl_node: tree_sitter::Node<'_>) -> Option<String> {
1018 // tree-sitter-rust: impl_item has a field named "type" for the self type.
1019 // For `impl Foo for Bar { }`, "type" is Bar; for `impl Foo { }`, "type" is Foo.
1020 let type_node = impl_node.child_by_field_name("type")?;
1021 Some(source[type_node.start_byte()..type_node.end_byte()].to_string())
1022}
1023
1024/// Extract parameter name → type mappings from a function signature.
1025///
1026/// Handles `fn foo(x: Bar, y: Baz)` → `{"x": "Bar", "y": "Baz"}`.
1027/// The `self`/`&self`/`&mut self` parameter is skipped (handled via impl_ctx).
1028fn extract_param_types(source: &str, fn_node: tree_sitter::Node<'_>) -> HashMap<String, String> {
1029 let mut params: HashMap<String, String> = HashMap::new();
1030 let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1031 return params;
1032 };
1033 let mut cursor = params_node.walk();
1034 for param in params_node.children(&mut cursor) {
1035 if param.kind() == "parameter" {
1036 // parameter has children: pattern (identifier) and type
1037 let mut param_name = None;
1038 let mut param_type = None;
1039 let mut pc = param.walk();
1040 for child in param.children(&mut pc) {
1041 match child.kind() {
1042 "identifier" | "mutable_specifier" if param_name.is_none() => {
1043 let text = source[child.start_byte()..child.end_byte()].to_string();
1044 if text != "mut" {
1045 param_name = Some(text);
1046 }
1047 }
1048 "type_identifier"
1049 | "generic_type"
1050 | "reference_type"
1051 | "scoped_type_identifier"
1052 if param_type.is_none() =>
1053 {
1054 // Extract the base type identifier from potentially complex types.
1055 param_type = Some(extract_base_type(source, child));
1056 }
1057 _ => {}
1058 }
1059 }
1060 if let (Some(name), Some(ty)) = (param_name, param_type)
1061 && !ty.is_empty()
1062 {
1063 params.insert(name, ty);
1064 }
1065 }
1066 // Also handle typed_pattern in newer grammars
1067 if param.kind() == "typed_pattern" {
1068 let mut name_part = None;
1069 let mut type_part = None;
1070 let mut pc = param.walk();
1071 for child in param.children(&mut pc) {
1072 if child.kind() == "identifier" && name_part.is_none() {
1073 name_part = Some(source[child.start_byte()..child.end_byte()].to_string());
1074 } else if matches!(
1075 child.kind(),
1076 "type_identifier"
1077 | "generic_type"
1078 | "reference_type"
1079 | "scoped_type_identifier"
1080 ) && type_part.is_none()
1081 {
1082 type_part = Some(extract_base_type(source, child));
1083 }
1084 }
1085 if let (Some(name), Some(ty)) = (name_part, type_part)
1086 && !ty.is_empty()
1087 {
1088 params.insert(name, ty);
1089 }
1090 }
1091 }
1092 params
1093}
1094
1095/// Extract the base `TypeIdentifier` from a potentially complex type node.
1096///
1097/// For `Bar`, `&Bar`, `&mut Bar`, `Bar<T>` → returns `"Bar"`.
1098/// For `module::Bar` → returns `"Bar"` (bare name for matching).
1099fn extract_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1100 match node.kind() {
1101 "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1102 "generic_type" | "reference_type" | "mutable_specifier" | "scoped_type_identifier" => {
1103 // Recurse to find the innermost type_identifier
1104 let mut cursor = node.walk();
1105 for child in node.children(&mut cursor) {
1106 let t = extract_base_type(source, child);
1107 if !t.is_empty() {
1108 return t;
1109 }
1110 }
1111 String::new()
1112 }
1113 _ => {
1114 // For other nodes, try children
1115 let mut cursor = node.walk();
1116 for child in node.children(&mut cursor) {
1117 if child.kind() == "type_identifier" {
1118 return source[child.start_byte()..child.end_byte()].to_string();
1119 }
1120 }
1121 String::new()
1122 }
1123 }
1124}
1125
1126/// Scan a function body for `let x = Foo::new()` patterns.
1127///
1128/// Returns a map from local variable name to the constructor type name.
1129/// E.g., `let x = Foo::new();` → `{"x": "Foo"}`.
1130fn extract_let_binding_types(
1131 source: &str,
1132 fn_node: tree_sitter::Node<'_>,
1133) -> HashMap<String, String> {
1134 let mut bindings: HashMap<String, String> = HashMap::new();
1135
1136 let Some(body) = fn_node.child_by_field_name("body") else {
1137 return bindings;
1138 };
1139
1140 // Walk the function body looking for let_declaration nodes.
1141 let mut stack = vec![body];
1142 while let Some(n) = stack.pop() {
1143 if n.kind() == "let_declaration" {
1144 // let_declaration: pattern (identifier) + value (call_expression or …)
1145 let mut binding_name = None;
1146 let mut constructor_type = None;
1147 let mut cursor = n.walk();
1148 for child in n.children(&mut cursor) {
1149 match child.kind() {
1150 "identifier" if binding_name.is_none() => {
1151 binding_name =
1152 Some(source[child.start_byte()..child.end_byte()].to_string());
1153 }
1154 "call_expression" => {
1155 // Look for `Foo::new()` or `Foo::from(…)` patterns.
1156 // The function child of call_expression is a scoped_identifier.
1157 if let Some(func) = child.child_by_field_name("function")
1158 && func.kind() == "scoped_identifier"
1159 {
1160 // scoped_identifier path: `Foo::new` — extract head segment.
1161 let full = source[func.start_byte()..func.end_byte()].to_string();
1162 let head = full.split("::").next().unwrap_or("").to_string();
1163 if !head.is_empty()
1164 && head.chars().next().is_some_and(char::is_uppercase)
1165 {
1166 constructor_type = Some(head);
1167 }
1168 }
1169 }
1170 _ => {}
1171 }
1172 }
1173 if let (Some(name), Some(ty)) = (binding_name, constructor_type) {
1174 bindings.insert(name, ty);
1175 }
1176 }
1177 // Push children for recursive walk.
1178 let mut cursor = n.walk();
1179 for child in n.children(&mut cursor) {
1180 stack.push(child);
1181 }
1182 }
1183
1184 bindings
1185}
1186
1187/// Walk a function body and annotate method-call byte offsets with receiver types.
1188///
1189/// A "method call" in the ripvec call query is:
1190/// `(call_expression function: (field_expression field: (field_identifier) @callee))`
1191///
1192/// The receiver is the `value` child of `field_expression`. This function
1193/// checks whether the receiver is:
1194/// - `self` → use `impl_ctx` type.
1195/// - An identifier matching a parameter type in `param_types`.
1196/// - An identifier matching a constructor let-binding in `let_types`.
1197fn annotate_method_calls(
1198 source: &str,
1199 fn_node: tree_sitter::Node<'_>,
1200 impl_ctx: Option<&str>,
1201 param_types: &HashMap<String, String>,
1202 let_types: &HashMap<String, String>,
1203 map: &mut HashMap<u32, String>,
1204) {
1205 // Walk the entire function (including its body) looking for call_expression nodes.
1206 let mut stack = vec![fn_node];
1207 while let Some(n) = stack.pop() {
1208 if n.kind() == "call_expression"
1209 && let Some(func) = n.child_by_field_name("function")
1210 && func.kind() == "field_expression"
1211 {
1212 // field_expression: value (receiver) + field (method name identifier)
1213 if let (Some(recv), Some(field)) = (
1214 func.child_by_field_name("value"),
1215 func.child_by_field_name("field"),
1216 ) {
1217 let recv_text = source[recv.start_byte()..recv.end_byte()].to_string();
1218 let receiver_type = if recv_text == "self" || recv_text == "*self" {
1219 impl_ctx.map(str::to_owned)
1220 } else {
1221 // Strip ref sigils for lookup.
1222 let base = recv_text
1223 .trim_start_matches('*')
1224 .trim_start_matches('&')
1225 .trim();
1226 param_types
1227 .get(base)
1228 .or_else(|| let_types.get(base))
1229 .cloned()
1230 };
1231
1232 if let Some(ty) = receiver_type {
1233 // The `@callee` capture byte offset is the start of the field node.
1234 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1235 let field_byte = field.start_byte() as u32;
1236 map.insert(field_byte, ty);
1237 }
1238 }
1239 }
1240 let mut cursor = n.walk();
1241 for child in n.children(&mut cursor) {
1242 stack.push(child);
1243 }
1244 }
1245}
1246
1247// ── Python receiver-type heuristic ───────────────────────────────────
1248
1249/// Walk the Python parse tree and fill `map` with receiver-type inference.
1250///
1251/// Two heuristic cases:
1252///
1253/// 1. **`self.method()` inside a method** — when the `attribute` call receiver is
1254/// the literal text `self`, the receiver type is the name of the nearest
1255/// enclosing `class_definition`.
1256///
1257/// 2. **`instance.method()` with a type annotation or constructor call** —
1258/// when a function parameter has a PEP 484 annotation `param: ClassName` or
1259/// when a local assignment `param = ClassName(...)` precedes the call, the
1260/// receiver type is bound to `ClassName`.
1261///
1262/// The Python call query captures:
1263/// - `(call function: (attribute attribute: (identifier) @callee)) @call`
1264///
1265/// Within the `attribute` node, `value` is the receiver expression and
1266/// `attribute` is the method name (the `@callee` capture). The `@callee`
1267/// byte offset is the start of the `attribute` child identifier node.
1268fn collect_python_receiver_types(
1269 source: &str,
1270 root: tree_sitter::Node<'_>,
1271 map: &mut HashMap<u32, String>,
1272) {
1273 // Stack carries (node, class_ctx: Option<String>).
1274 // class_ctx is the name of the nearest enclosing class_definition.
1275 let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(root, None)];
1276
1277 while let Some((n, class_ctx)) = stack.pop() {
1278 match n.kind() {
1279 "class_definition" => {
1280 // Extract the class name from the `name` child.
1281 let class_name = n
1282 .child_by_field_name("name")
1283 .map(|c| source[c.start_byte()..c.end_byte()].to_string());
1284 let new_ctx = class_name.or_else(|| class_ctx.clone());
1285 let mut cursor = n.walk();
1286 for child in n.children(&mut cursor) {
1287 stack.push((child, new_ctx.clone()));
1288 }
1289 }
1290 "function_definition" => {
1291 // Build parameter annotation map: param_name → type_name.
1292 let param_types = extract_python_param_types(source, n);
1293 // Build local assignment map: var_name → constructor type.
1294 let let_types = extract_python_assignment_types(source, n);
1295 // Annotate attribute call sites within this function body.
1296 annotate_python_method_calls(
1297 source,
1298 n,
1299 class_ctx.as_deref(),
1300 ¶m_types,
1301 &let_types,
1302 map,
1303 );
1304 // Push children with same class_ctx so nested classes are found.
1305 let mut cursor = n.walk();
1306 for child in n.children(&mut cursor) {
1307 stack.push((child, class_ctx.clone()));
1308 }
1309 }
1310 _ => {
1311 let mut cursor = n.walk();
1312 for child in n.children(&mut cursor) {
1313 stack.push((child, class_ctx.clone()));
1314 }
1315 }
1316 }
1317 }
1318}
1319
1320/// Extract Python parameter name → type annotation mappings.
1321///
1322/// Handles PEP 484 style: `def foo(self, x: Bar, y: Baz) -> ...`.
1323/// The `self` parameter is excluded (handled via class_ctx).
1324/// Returns `{"x": "Bar", "y": "Baz"}`.
1325fn extract_python_param_types(
1326 source: &str,
1327 fn_node: tree_sitter::Node<'_>,
1328) -> HashMap<String, String> {
1329 let mut params: HashMap<String, String> = HashMap::new();
1330 let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1331 return params;
1332 };
1333
1334 // Parameters node children include `identifier`, `typed_parameter`,
1335 // `typed_default_parameter`, and others.
1336 let mut cursor = params_node.walk();
1337 for param in params_node.children(&mut cursor) {
1338 match param.kind() {
1339 "typed_parameter" => {
1340 // (typed_parameter (identifier) @name type: (type) @type)
1341 // First identifier child is the name; type child is the type.
1342 let mut name_text = None;
1343 let mut type_text = None;
1344 let mut pc = param.walk();
1345 for child in param.children(&mut pc) {
1346 match child.kind() {
1347 "identifier" if name_text.is_none() => {
1348 let t = source[child.start_byte()..child.end_byte()].to_string();
1349 if t != "self" && t != "cls" {
1350 name_text = Some(t);
1351 }
1352 }
1353 "type" | "identifier" | "attribute"
1354 if type_text.is_none() && name_text.is_some() =>
1355 {
1356 // The type child in tree-sitter-python is a `type` node
1357 // whose text is the annotation expression. Extract the
1358 // base identifier (handle `Optional[Bar]`, `List[Bar]`, etc.)
1359 type_text = Some(extract_python_base_type(source, child));
1360 }
1361 _ => {}
1362 }
1363 }
1364 if let (Some(name), Some(ty)) = (name_text, type_text)
1365 && !ty.is_empty()
1366 && !ty.eq("self")
1367 && !ty.eq("cls")
1368 {
1369 params.insert(name, ty);
1370 }
1371 }
1372 "typed_default_parameter" => {
1373 // (typed_default_parameter name: (identifier) type: (type) value: …)
1374 let name_node = param.child_by_field_name("name");
1375 let type_node = param.child_by_field_name("type");
1376 if let (Some(nn), Some(tn)) = (name_node, type_node) {
1377 let name = source[nn.start_byte()..nn.end_byte()].to_string();
1378 if name != "self" && name != "cls" {
1379 let ty = extract_python_base_type(source, tn);
1380 if !ty.is_empty() {
1381 params.insert(name, ty);
1382 }
1383 }
1384 }
1385 }
1386 _ => {}
1387 }
1388 }
1389 params
1390}
1391
1392/// Extract the base type name from a Python type annotation node.
1393///
1394/// For `Bar` → `"Bar"`. For `Optional[Bar]` or `List[Bar]` → `"Bar"`.
1395/// For `module.Class` → `"Class"` (bare name only).
1396fn extract_python_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1397 match node.kind() {
1398 "identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1399 // tree-sitter-python wraps annotations in a `type` node
1400 "type" => {
1401 let mut cursor = node.walk();
1402 for child in node.children(&mut cursor) {
1403 let t = extract_python_base_type(source, child);
1404 if !t.is_empty() {
1405 return t;
1406 }
1407 }
1408 String::new()
1409 }
1410 // Generic alias: `Optional[Bar]` — the first identifier child is `Optional`,
1411 // the subscript child contains `Bar`. We want the subscript content.
1412 "subscript" => {
1413 // subscript has value (e.g. Optional) and subscript (e.g. Bar).
1414 // Return the subscript's base type (the inner type argument).
1415 if let Some(sub) = node.child_by_field_name("subscript") {
1416 return extract_python_base_type(source, sub);
1417 }
1418 // Fall back: first identifier
1419 let mut cursor = node.walk();
1420 for child in node.children(&mut cursor) {
1421 if child.kind() == "identifier" {
1422 return source[child.start_byte()..child.end_byte()].to_string();
1423 }
1424 }
1425 String::new()
1426 }
1427 // Attribute node `module.Class` → take last identifier
1428 "attribute" => {
1429 if let Some(attr) = node.child_by_field_name("attribute") {
1430 return source[attr.start_byte()..attr.end_byte()].to_string();
1431 }
1432 String::new()
1433 }
1434 _ => {
1435 // Try first identifier child
1436 let mut cursor = node.walk();
1437 for child in node.children(&mut cursor) {
1438 if child.kind() == "identifier" {
1439 return source[child.start_byte()..child.end_byte()].to_string();
1440 }
1441 }
1442 String::new()
1443 }
1444 }
1445}
1446
1447/// Scan a Python function body for `x = ClassName(...)` assignment patterns.
1448///
1449/// Returns a map from local variable name to constructor type.
1450/// E.g., `x = Foo()` → `{"x": "Foo"}`.
1451/// Also handles `x = module.ClassName(...)` → `{"x": "ClassName"}`.
1452fn extract_python_assignment_types(
1453 source: &str,
1454 fn_node: tree_sitter::Node<'_>,
1455) -> HashMap<String, String> {
1456 let mut bindings: HashMap<String, String> = HashMap::new();
1457 let Some(body) = fn_node.child_by_field_name("body") else {
1458 return bindings;
1459 };
1460
1461 let mut stack = vec![body];
1462 while let Some(n) = stack.pop() {
1463 if n.kind() == "assignment" {
1464 // assignment: left = right
1465 // We want: left is a simple identifier, right is a call whose
1466 // function is an identifier starting with an uppercase letter
1467 // (Python convention for class names).
1468 let left = n.child_by_field_name("left");
1469 let right = n.child_by_field_name("right");
1470 if let (Some(lhs), Some(rhs)) = (left, right)
1471 && lhs.kind() == "identifier"
1472 && rhs.kind() == "call"
1473 && let Some(func) = rhs.child_by_field_name("function")
1474 {
1475 let var_name = source[lhs.start_byte()..lhs.end_byte()].to_string();
1476 let constructor_type = match func.kind() {
1477 "identifier" => {
1478 let t = source[func.start_byte()..func.end_byte()].to_string();
1479 // Class names are conventionally uppercase-first
1480 if t.chars().next().is_some_and(char::is_uppercase) {
1481 Some(t)
1482 } else {
1483 None
1484 }
1485 }
1486 "attribute" => {
1487 // `module.ClassName(...)` — take the `attribute` part
1488 func.child_by_field_name("attribute")
1489 .map(|a| source[a.start_byte()..a.end_byte()].to_string())
1490 }
1491 _ => None,
1492 };
1493 if let Some(ty) = constructor_type {
1494 bindings.insert(var_name, ty);
1495 }
1496 }
1497 }
1498 let mut cursor = n.walk();
1499 for child in n.children(&mut cursor) {
1500 stack.push(child);
1501 }
1502 }
1503 bindings
1504}
1505
1506/// Walk a Python function body and annotate attribute-call byte offsets.
1507///
1508/// A Python method call is:
1509/// `(call function: (attribute value: <receiver> attribute: (identifier) @callee))`
1510///
1511/// The receiver is the `value` child of `attribute`. This function checks:
1512/// - `self` → use the enclosing class name (`class_ctx`).
1513/// - An identifier matching a parameter type in `param_types`.
1514/// - An identifier matching a constructor assignment in `let_types`.
1515fn annotate_python_method_calls(
1516 source: &str,
1517 fn_node: tree_sitter::Node<'_>,
1518 class_ctx: Option<&str>,
1519 param_types: &HashMap<String, String>,
1520 let_types: &HashMap<String, String>,
1521 map: &mut HashMap<u32, String>,
1522) {
1523 let mut stack = vec![fn_node];
1524 while let Some(n) = stack.pop() {
1525 if n.kind() == "call"
1526 && let Some(func) = n.child_by_field_name("function")
1527 && func.kind() == "attribute"
1528 && let (Some(recv_node), Some(attr_node)) = (
1529 func.child_by_field_name("object"),
1530 func.child_by_field_name("attribute"),
1531 )
1532 {
1533 // attribute node: object (receiver) + attribute (method name)
1534 let recv_text = source[recv_node.start_byte()..recv_node.end_byte()].to_string();
1535 let receiver_type = if recv_text == "self" || recv_text == "cls" {
1536 class_ctx.map(str::to_owned)
1537 } else if recv_node.kind() == "identifier" {
1538 param_types
1539 .get(&recv_text)
1540 .or_else(|| let_types.get(&recv_text))
1541 .cloned()
1542 } else {
1543 None
1544 };
1545
1546 if let Some(ty) = receiver_type {
1547 // The `@callee` capture byte offset is the `attribute` child.
1548 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1549 let attr_byte = attr_node.start_byte() as u32;
1550 map.insert(attr_byte, ty);
1551 }
1552 }
1553 let mut cursor = n.walk();
1554 for child in n.children(&mut cursor) {
1555 stack.push(child);
1556 }
1557 }
1558}
1559
1560// ── Python class hierarchy (MRO) extraction ───────────────────────────
1561
1562/// Walk the Python parse tree and add class → parent-names entries to `out`.
1563///
1564/// tree-sitter-python shape:
1565/// ```text
1566/// (class_definition
1567/// name: (identifier) @child
1568/// superclasses: (argument_list
1569/// (identifier) @parent ; bare parent: class Foo(Bar):
1570/// (attribute attribute: (identifier) @parent) ; qualified: class Foo(mod.Bar):
1571/// (keyword_argument …) ; ignored: class Foo(Bar, metaclass=Meta):
1572/// )?
1573/// ...)
1574/// ```
1575///
1576/// Each map entry's key is a class defined in the file; the value is the
1577/// ordered list of declared parent class **names** (the trailing `attribute`
1578/// segment, so `mod.Bar` becomes `Bar`). Classes with no declared parents
1579/// still get an entry (empty `Vec`) so a downstream MRO walk can tell
1580/// "known class with no parents" from "unknown class".
1581fn extract_python_class_hierarchy_node(
1582 source: &str,
1583 root: tree_sitter::Node<'_>,
1584 out: &mut HashMap<String, Vec<String>>,
1585) {
1586 let mut stack = vec![root];
1587 while let Some(n) = stack.pop() {
1588 if n.kind() == "class_definition"
1589 && let Some(name_node) = n.child_by_field_name("name")
1590 {
1591 let class_name = source[name_node.start_byte()..name_node.end_byte()].to_string();
1592 let mut parents: Vec<String> = Vec::new();
1593 if let Some(superclasses) = n.child_by_field_name("superclasses") {
1594 // superclasses is an `argument_list`; iterate its children and
1595 // collect identifiers / attribute trailing-names. Skip
1596 // keyword_argument entries (metaclass=…, etc.) and punctuation.
1597 let mut sc = superclasses.walk();
1598 for child in superclasses.children(&mut sc) {
1599 match child.kind() {
1600 "identifier" => {
1601 let t = source[child.start_byte()..child.end_byte()].to_string();
1602 parents.push(t);
1603 }
1604 "attribute" => {
1605 // module.Cls → take the trailing `attribute` segment.
1606 if let Some(attr) = child.child_by_field_name("attribute") {
1607 parents
1608 .push(source[attr.start_byte()..attr.end_byte()].to_string());
1609 }
1610 }
1611 // Drop keyword_argument, "(", ")", ",", comments, etc.
1612 _ => {}
1613 }
1614 }
1615 }
1616 out.insert(class_name, parents);
1617 }
1618 let mut cursor = n.walk();
1619 for child in n.children(&mut cursor) {
1620 stack.push(child);
1621 }
1622 }
1623}
1624
1625/// Extract the Python `class → [parents]` map from a single source file by
1626/// parsing it with tree-sitter-python.
1627///
1628/// Returns an empty map when the source fails to parse or contains no
1629/// `class_definition` nodes. The returned map is the per-file contribution
1630/// to the global hierarchy used by [`resolve_calls_with_python_mro_pub`]
1631/// for MRO-aware receiver-type dispatch (Q1, Wave 2).
1632#[must_use]
1633pub fn extract_python_class_hierarchy(source: &str) -> HashMap<String, Vec<String>> {
1634 let mut parser = Parser::new();
1635 let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
1636 if parser.set_language(&lang).is_err() {
1637 return HashMap::new();
1638 }
1639 let Some(tree) = parser.parse(source, None) else {
1640 return HashMap::new();
1641 };
1642 let mut out: HashMap<String, Vec<String>> = HashMap::new();
1643 extract_python_class_hierarchy_node(source, tree.root_node(), &mut out);
1644 out
1645}
1646
1647/// Compute the linearised MRO (Method Resolution Order) for a Python class
1648/// name using a **simplified left-first depth-first walk** of the declared
1649/// `class → [parents]` hierarchy.
1650///
1651/// Python's real MRO uses C3 linearisation, which is monotonic and respects
1652/// declaration order across the diamond inheritance shape. For ripvec's
1653/// reverse-call-graph purpose we want *any plausible ancestor* of the
1654/// receiver type — including ancestors only reachable via a mixin — so we
1655/// can resolve `self.method()` calls whose dispatch lands on an ancestor.
1656/// The simplification: pre-order DFS, left-to-right, skipping cycles via a
1657/// `visited` set.
1658///
1659/// On a non-diamond shape this matches C3 exactly. On a diamond the
1660/// simplified walk may surface an ancestor earlier than C3 would, but every
1661/// ancestor C3 would visit is still reached — and ripvec's goal is "find
1662/// the implementing def for an inherited call", not "compute the runtime
1663/// dispatch winner". Over-approximating ancestors only matters when two
1664/// ancestors define the same method, and even then the left-first order
1665/// matches C3 on the common patterns
1666/// (`class Sub(Base, Mixin)` → `Sub, Base, Mixin, <Base's ancestors>, <Mixin's ancestors>`).
1667///
1668/// The returned list excludes the start class itself. Each entry appears at
1669/// most once even when reachable through multiple parent chains.
1670fn compute_python_mro<H: std::hash::BuildHasher>(
1671 start: &str,
1672 hierarchy: &HashMap<String, Vec<String>, H>,
1673) -> Vec<String> {
1674 use std::collections::HashSet;
1675 let mut order: Vec<String> = Vec::new();
1676 let mut visited: HashSet<String> = HashSet::new();
1677 // Start with the immediate parents of `start` (the receiver-type's own
1678 // scope was already searched by Priority 2's direct match).
1679 let Some(start_parents) = hierarchy.get(start) else {
1680 return order;
1681 };
1682 // DFS stack: we push in reverse so pop yields left-first order.
1683 let mut stack: Vec<String> = start_parents.iter().rev().cloned().collect();
1684 while let Some(cls) = stack.pop() {
1685 if !visited.insert(cls.clone()) {
1686 continue;
1687 }
1688 order.push(cls.clone());
1689 if let Some(parents) = hierarchy.get(&cls) {
1690 for p in parents.iter().rev() {
1691 if !visited.contains(p) {
1692 stack.push(p.clone());
1693 }
1694 }
1695 }
1696 }
1697 order
1698}
1699
1700// ── Go receiver-type heuristic ────────────────────────────────────────
1701
1702/// Walk the Go parse tree and fill `map` with receiver-type inference.
1703///
1704/// One heuristic case: **`recv.Method()` inside a `method_declaration`**.
1705///
1706/// Go methods have an explicit receiver parameter in their signature:
1707/// `func (r *Foo) Bar() { r.Baz() }` — `r` is bound to type `Foo`.
1708///
1709/// The Go call query captures:
1710/// `(call_expression function: (selector_expression field: (field_identifier) @callee))`
1711///
1712/// Within `selector_expression`, `operand` is the receiver expression and
1713/// `field` is the method name (the `@callee` capture).
1714///
1715/// This function also handles `self.method()` patterns for cases where code
1716/// uses `self` as a receiver name (not idiomatic Go, but it occurs).
1717fn collect_go_receiver_types(
1718 source: &str,
1719 root: tree_sitter::Node<'_>,
1720 map: &mut HashMap<u32, String>,
1721) {
1722 // Stack carries (node, receiver_binding: Option<(recv_name, recv_type)>).
1723 let mut stack: Vec<(tree_sitter::Node<'_>, Option<(String, String)>)> = vec![(root, None)];
1724
1725 while let Some((n, recv_binding)) = stack.pop() {
1726 if n.kind() == "method_declaration" {
1727 // Extract the receiver name and type from the method signature.
1728 let binding = extract_go_receiver_binding(source, n);
1729 let new_binding = binding.or_else(|| recv_binding.clone());
1730 let mut cursor = n.walk();
1731 for child in n.children(&mut cursor) {
1732 stack.push((child, new_binding.clone()));
1733 }
1734 } else {
1735 // For any call_expression whose function is a selector_expression,
1736 // check if the operand matches the active receiver binding.
1737 if n.kind() == "call_expression"
1738 && let Some(func) = n.child_by_field_name("function")
1739 && func.kind() == "selector_expression"
1740 && let (Some(operand), Some(field)) = (
1741 func.child_by_field_name("operand"),
1742 func.child_by_field_name("field"),
1743 )
1744 {
1745 let recv_text = source[operand.start_byte()..operand.end_byte()].to_string();
1746 let receiver_type = recv_binding.as_ref().and_then(|(recv_name, recv_ty)| {
1747 if recv_text == *recv_name {
1748 Some(recv_ty.clone())
1749 } else {
1750 None
1751 }
1752 });
1753
1754 if let Some(ty) = receiver_type {
1755 // The `@callee` capture byte offset is the `field` child.
1756 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1757 let field_byte = field.start_byte() as u32;
1758 map.insert(field_byte, ty);
1759 }
1760 }
1761
1762 let mut cursor = n.walk();
1763 for child in n.children(&mut cursor) {
1764 stack.push((child, recv_binding.clone()));
1765 }
1766 }
1767 }
1768}
1769
1770/// Extract the receiver name and base type from a Go `method_declaration`.
1771///
1772/// Go method declaration shape (tree-sitter-go):
1773/// ```text
1774/// (method_declaration
1775/// receiver: (parameter_list
1776/// (parameter_declaration
1777/// name: (identifier) ← receiver name
1778/// type: (type_identifier ← receiver type (bare)
1779/// | pointer_type (type_identifier)) ← or *Type
1780/// )
1781/// )
1782/// name: (field_identifier)
1783/// ...
1784/// )
1785/// ```
1786///
1787/// Returns `Some((recv_name, type_name))` or `None` if the receiver is unnamed
1788/// (blank identifier `_`) or has an unrecognisable shape.
1789fn extract_go_receiver_binding(
1790 source: &str,
1791 method_node: tree_sitter::Node<'_>,
1792) -> Option<(String, String)> {
1793 let receiver_list = method_node.child_by_field_name("receiver")?;
1794 // parameter_list contains one parameter_declaration
1795 let mut cursor = receiver_list.walk();
1796 for param in receiver_list.children(&mut cursor) {
1797 if param.kind() == "parameter_declaration" {
1798 let name_node = param.child_by_field_name("name");
1799 let type_node = param.child_by_field_name("type");
1800 if let (Some(nn), Some(tn)) = (name_node, type_node) {
1801 let name = source[nn.start_byte()..nn.end_byte()].to_string();
1802 if name == "_" || name.is_empty() {
1803 return None;
1804 }
1805 let ty = extract_go_base_type(source, tn);
1806 if !ty.is_empty() {
1807 return Some((name, ty));
1808 }
1809 }
1810 }
1811 }
1812 None
1813}
1814
1815/// Extract the base type name from a Go type node.
1816///
1817/// For `Foo` (type_identifier) → `"Foo"`.
1818/// For `*Foo` (pointer_type → type_identifier) → `"Foo"`.
1819fn extract_go_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1820 match node.kind() {
1821 "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1822 "pointer_type" => {
1823 // pointer_type has one child: the pointee type
1824 let mut cursor = node.walk();
1825 for child in node.children(&mut cursor) {
1826 if child.kind() == "type_identifier" {
1827 return source[child.start_byte()..child.end_byte()].to_string();
1828 }
1829 let t = extract_go_base_type(source, child);
1830 if !t.is_empty() {
1831 return t;
1832 }
1833 }
1834 String::new()
1835 }
1836 _ => {
1837 let mut cursor = node.walk();
1838 for child in node.children(&mut cursor) {
1839 if child.kind() == "type_identifier" {
1840 return source[child.start_byte()..child.end_byte()].to_string();
1841 }
1842 }
1843 String::new()
1844 }
1845 }
1846}
1847
1848/// Enrich Go `method_declaration` definition scopes with their receiver type name.
1849///
1850/// In the generic `extract_definitions` path, `build_scope_chain` walks the
1851/// *parent* chain of the `@def` node. For Go `method_declaration`, the parent
1852/// is the file root — so the scope is always `""`.
1853///
1854/// An empty scope means `resolve_calls` Priority 2 (receiver-type matching via
1855/// `scope.contains(recv_type)`) never fires for Go methods. Cross-file calls
1856/// where the caller inferred `receiver_type = Some("Foo")` stay unresolved;
1857/// no edge is recorded; `def_callers[]` stays empty for those defs — the root
1858/// cause of the missing inverse index for Go (I#P1).
1859///
1860/// Fix: after `extract_definitions`, parse the Go source a second time to find
1861/// each `method_declaration`'s receiver type, then set the matching def's scope
1862/// to `"method_declaration {ReceiverType}"`. This matches the pattern used by
1863/// the existing `go_resolve_receiver_method_via_signature` integration test,
1864/// which asserts that `scope.contains("Foo")` succeeds when the scope is
1865/// `"method_declaration Foo"`.
1866///
1867/// Matching is by `start_byte` (precise) so name collisions across different
1868/// receiver types are handled correctly.
1869fn enrich_go_method_def_scopes(source: &str, defs: &mut [Definition]) {
1870 let go_lang: tree_sitter::Language = tree_sitter_go::LANGUAGE.into();
1871 let mut parser = Parser::new();
1872 if parser.set_language(&go_lang).is_err() {
1873 return;
1874 }
1875 let Some(tree) = parser.parse(source, None) else {
1876 return;
1877 };
1878
1879 // Walk all top-level method_declaration nodes.
1880 let root = tree.root_node();
1881 let mut method_cursor = root.walk();
1882 for child in root.children(&mut method_cursor) {
1883 if child.kind() != "method_declaration" {
1884 continue;
1885 }
1886 let Some((_, recv_type)) = extract_go_receiver_binding(source, child) else {
1887 continue;
1888 };
1889 // Match by start_byte (precise): the @def node for method_declaration in
1890 // the Go definition query is the method_declaration node itself, so its
1891 // start_byte matches the def's start_byte recorded during extract_definitions.
1892 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1893 let method_start_byte = child.start_byte() as u32;
1894 for def in defs.iter_mut() {
1895 if def.kind == "method_declaration" && def.start_byte == method_start_byte {
1896 def.scope = format!("method_declaration {recv_type}");
1897 break;
1898 }
1899 }
1900 }
1901}
1902
1903/// Public wrapper for `enrich_go_method_def_scopes` — enables integration tests
1904/// to call it directly without going through the full `build_graph` pipeline.
1905pub fn enrich_go_method_def_scopes_pub(source: &str, defs: &mut [Definition]) {
1906 enrich_go_method_def_scopes(source, defs);
1907}
1908
1909/// SQL: prepend a synthetic whole-file definition whose name is the filename stem.
1910///
1911/// dbt and sqlmesh follow a filename-as-model-name convention:
1912///
1913/// - `silver_issuer_returns.sql` defines the `silver_issuer_returns` model.
1914/// - `gold_issuer_returns.sql` references the silver model by filename stem,
1915/// not by any in-source CREATE TABLE.
1916///
1917/// In sqlmesh, the in-source name is templated:
1918///
1919/// ```sql
1920/// MODEL (
1921/// name @{athena_sqlmesh_silver_schema}.issuer_returns,
1922/// ...
1923/// );
1924/// SELECT ... FROM @{athena_sqlmesh_silver_schema}.stg_issuer_returns;
1925/// ```
1926///
1927/// The `MODEL (...)` header parses as an ERROR node under tree-sitter-sequel
1928/// because `@{var}` is not standard SQL; FROM/JOIN further down the file still
1929/// extract cleanly. Without a synthetic def, `lsp_workspace_symbols(query=
1930/// "silver_issuer_returns")` returns no hits — there is no real CREATE TABLE
1931/// in the file and the model name is interpolation only.
1932///
1933/// This helper prepends a definition with:
1934/// - `name` = filename stem (e.g., `silver_issuer_returns`)
1935/// - `kind` = `"sql_file"` (maps to `LSP SymbolKind::File` in
1936/// [`languages::lsp_symbol_kind_for_node_kind`])
1937/// - byte range = the entire source `[0, source.len())`
1938/// - scope / signature / qualified_name = empty / None
1939///
1940/// The whole-file byte range is the key to FROM/JOIN attribution: when
1941/// `extract_calls` later places a CallRef from a FROM clause that is not
1942/// inside any CTE or other smaller def, the smallest-enclosing-def search
1943/// lands on this synthetic file def and the edge is recorded.
1944///
1945/// If the filename has no stem (empty / `..`), the helper is a no-op.
1946/// Idempotent: if a `sql_file` def already exists at byte 0, it is left alone.
1947pub(crate) fn enrich_sql_file_def(filename: &str, source: &str, defs: &mut Vec<Definition>) {
1948 // Idempotency: do nothing if a sql_file def is already present at byte 0.
1949 if defs
1950 .iter()
1951 .any(|d| d.kind == "sql_file" && d.start_byte == 0)
1952 {
1953 return;
1954 }
1955
1956 // Derive the filename stem (last path component, file extension stripped).
1957 let stem = std::path::Path::new(filename)
1958 .file_stem()
1959 .and_then(|s| s.to_str())
1960 .unwrap_or_default();
1961 if stem.is_empty() {
1962 return;
1963 }
1964
1965 // Count newlines so end_line is reasonable for downstream UI.
1966 let end_line_zero_based = source.bytes().filter(|&b| b == b'\n').count();
1967 #[expect(clippy::cast_possible_truncation, reason = "line counts fit in u32")]
1968 let end_line = (end_line_zero_based as u32) + 1;
1969 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1970 let end_byte = source.len() as u32;
1971
1972 let file_def = Definition {
1973 name: stem.to_string(),
1974 kind: "sql_file".to_string(),
1975 start_line: 1,
1976 end_line,
1977 scope: String::new(),
1978 signature: None,
1979 start_byte: 0,
1980 end_byte,
1981 calls: vec![],
1982 };
1983 // Prepend so it remains the outermost (largest) enclosing def at byte 0,
1984 // ensuring narrow CTE defs are still preferred for inner-FROM attribution
1985 // (the smallest-enclosing rule in `extract_calls`).
1986 defs.insert(0, file_def);
1987}
1988
1989/// Public wrapper for [`enrich_sql_file_def`] — enables integration tests
1990/// to call it directly without going through the full `build_graph` pipeline.
1991///
1992/// This is `pub` (not `pub(crate)`) because integration tests in
1993/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
1994/// `pub(crate)` items.
1995pub fn enrich_sql_file_def_pub(filename: &str, source: &str, defs: &mut Vec<Definition>) {
1996 enrich_sql_file_def(filename, source, defs);
1997}
1998
1999/// Public wrapper for `extract_calls` — enables integration tests to call it
2000/// directly without going through the full `build_graph` pipeline.
2001///
2002/// This is `pub` (not `pub(crate)`) because integration tests in
2003/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
2004/// `pub(crate)` items.
2005pub fn extract_calls_pub(
2006 source: &str,
2007 call_config: &languages::CallConfig,
2008 defs: &mut [Definition],
2009) {
2010 extract_calls(source, call_config, defs);
2011}
2012
2013/// Build an index from definition name to list of `DefId`s.
2014#[must_use]
2015pub fn build_def_index_pub(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2016 build_def_index(files)
2017}
2018
2019fn build_def_index(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2020 let mut index: HashMap<String, Vec<DefId>> = HashMap::new();
2021 for (file_idx, file) in files.iter().enumerate() {
2022 for (def_idx, def) in file.defs.iter().enumerate() {
2023 #[expect(clippy::cast_possible_truncation)]
2024 let did: DefId = (file_idx as u32, def_idx as u16);
2025 index.entry(def.name.clone()).or_default().push(did);
2026 }
2027 }
2028 index
2029}
2030
2031/// Resolve call references to target definitions.
2032///
2033/// Resolution priority:
2034///
2035/// 1. **Qualified path** (`qualified_path = Some("mod_a::foo")`): filter candidates
2036/// by qualifier match (file path or scope contains the qualifier segment). Unique
2037/// match → resolve; ambiguous or no match → leave `None`.
2038/// 2. **Receiver type — direct scope match** (`receiver_type = Some("Foo")`):
2039/// for method calls, prefer candidates whose `scope` contains the receiver
2040/// type name (e.g., `"impl_item Foo"`). Among receiver-matching candidates,
2041/// further prefer those in imported files. Unique match → resolve;
2042/// ambiguous → leave `None`.
2043/// When this step finds nothing on the receiver class itself, sub-step 2b
2044/// (Python MRO walk) runs: when `Foo` has a recorded parent chain, walk
2045/// the receiver class's MRO (left-first DFS) and try the scope-match
2046/// against each ancestor's name. First ancestor with a matching candidate
2047/// wins. See [`compute_python_mro`] for the simplification rationale (it
2048/// diverges from C3 only on diamond shapes where two ancestors define the
2049/// same name).
2050/// 3. **Same file** (unqualified, no receiver): prefer definitions in the caller's
2051/// own file.
2052/// 4. **SQL suffix-match** (sql_file callers only, no exact-name match): when
2053/// the caller def has `kind = "sql_file"` and the bare `call_name` (e.g.,
2054/// `"issuer_returns"`) has no exact entry in the def index, scan all
2055/// `sql_file` defs for names ending with `_<call_name>` (e.g.,
2056/// `"silver_issuer_returns"`). This bridges dbt / sqlmesh layered schema
2057/// prefixes: `gold_issuer_returns.sql` uses `FROM @{schema}.issuer_returns`
2058/// which tree-sitter reduces to the bare name `"issuer_returns"`, while the
2059/// target def is the synthetic `sql_file` def named `"silver_issuer_returns"`.
2060/// Unique suffix-match → resolve; ambiguous (multiple layers match) or
2061/// no match → leave `None`. Non-sql_file callers are explicitly excluded.
2062/// 5. **Imported file** (unqualified, no receiver): check definitions in files this
2063/// file imports. Unique imported candidate → resolve.
2064/// 6. **Ambiguous or unresolved**: leave `resolved` as `None` (no silent first-wins).
2065///
2066/// Equivalent to [`resolve_calls_with_python_mro_pub`] with an empty MRO map
2067/// (Priority 2.5 is a no-op).
2068pub fn resolve_calls_pub<S: std::hash::BuildHasher>(
2069 files: &mut [FileNode],
2070 def_index: &HashMap<String, Vec<DefId>, S>,
2071) {
2072 let empty: HashMap<String, Vec<String>> = HashMap::new();
2073 resolve_calls_inner(files, def_index, &empty);
2074}
2075
2076/// Resolve call references with MRO-aware Python receiver dispatch enabled.
2077///
2078/// Identical to [`resolve_calls_pub`] except that Priority 2.5 (the MRO walk)
2079/// fires when the caller passes a non-empty `python_class_hierarchy`.
2080/// `build_graph` populates the hierarchy by parsing every Python source file
2081/// with [`extract_python_class_hierarchy`] and merging the per-file maps.
2082///
2083/// Tests that want to exercise MRO resolution without going through
2084/// `build_graph` can call this directly with a synthetic hierarchy.
2085pub fn resolve_calls_with_python_mro_pub<S, H>(
2086 files: &mut [FileNode],
2087 def_index: &HashMap<String, Vec<DefId>, S>,
2088 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2089) where
2090 S: std::hash::BuildHasher,
2091 H: std::hash::BuildHasher,
2092{
2093 resolve_calls_inner(files, def_index, python_class_hierarchy);
2094}
2095
2096fn resolve_calls<S, H>(
2097 files: &mut [FileNode],
2098 def_index: &HashMap<String, Vec<DefId>, S>,
2099 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2100) where
2101 S: std::hash::BuildHasher,
2102 H: std::hash::BuildHasher,
2103{
2104 resolve_calls_inner(files, def_index, python_class_hierarchy);
2105}
2106
2107#[expect(
2108 clippy::too_many_lines,
2109 reason = "7-priority resolution cascade (qualified path, receiver type, MRO walk, same-file, \
2110 SQL suffix-match, imported-file, ambiguous); each priority is a distinct decision \
2111 branch and extracting helpers would require passing large shared state across boundaries"
2112)]
2113fn resolve_calls_inner<S, H>(
2114 files: &mut [FileNode],
2115 def_index: &HashMap<String, Vec<DefId>, S>,
2116 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2117) where
2118 S: std::hash::BuildHasher,
2119 H: std::hash::BuildHasher,
2120{
2121 // Pre-compute imported file sets for each file.
2122 let imported_files: Vec<std::collections::HashSet<u32>> = files
2123 .iter()
2124 .map(|f| {
2125 f.imports
2126 .iter()
2127 .filter_map(|imp| imp.resolved_idx)
2128 .collect()
2129 })
2130 .collect();
2131
2132 for file_idx in 0..files.len() {
2133 for def_idx in 0..files[file_idx].defs.len() {
2134 for call_idx in 0..files[file_idx].defs[def_idx].calls.len() {
2135 let call_name = files[file_idx].defs[def_idx].calls[call_idx].name.clone();
2136 let qualified_path = files[file_idx].defs[def_idx].calls[call_idx]
2137 .qualified_path
2138 .clone();
2139 let receiver_type = files[file_idx].defs[def_idx].calls[call_idx]
2140 .receiver_type
2141 .clone();
2142
2143 // HCL: dedicated resolution for `terraform_remote_state.<NAME>`
2144 // and `module.<NAME>` qualified paths. Aurora's module DAG is
2145 // expressed by these patterns; resolve to the first def in any
2146 // file under a `/<NAME>/` directory segment. This is the
2147 // module-source contract (R2 + R3, Wave 3).
2148 if let Some(ref qpath) = qualified_path
2149 && (qpath.starts_with("terraform_remote_state.")
2150 || qpath.starts_with("module."))
2151 {
2152 let target = &call_name; // already the bare module label
2153 let segment_match = format!("/{target}/");
2154 let alt_segment_prefix = format!("{target}/"); // when path starts with target dir
2155 let candidate = files.iter().enumerate().find_map(|(idx, f)| {
2156 if f.path.contains(&segment_match)
2157 || f.path.starts_with(&alt_segment_prefix)
2158 {
2159 // Pick the first def in the file (or skip if file
2160 // has no defs).
2161 if !f.defs.is_empty() {
2162 #[expect(
2163 clippy::cast_possible_truncation,
2164 reason = "file index fits in u32"
2165 )]
2166 {
2167 return Some((idx as u32, 0u16));
2168 }
2169 }
2170 }
2171 None
2172 });
2173 if let Some(did) = candidate {
2174 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2175 }
2176 continue;
2177 }
2178
2179 // ── Priority 1: Qualified-path resolution ────────────────
2180 //
2181 // `qualified_path` carries the full scoped path (e.g. "mod_a::foo").
2182 // We look up candidates by the bare `call_name`, then filter by
2183 // whether the file path or scope contains the qualifier prefix.
2184 if let Some(ref qpath) = qualified_path {
2185 // Qualifier is everything before the final `::`.
2186 let qualifier = if let Some(pos) = qpath.rfind("::") {
2187 &qpath[..pos]
2188 } else {
2189 qpath.as_str()
2190 };
2191 let qual_segments: Vec<&str> = qualifier.split("::").collect();
2192
2193 let Some(candidates) = def_index.get(&call_name) else {
2194 continue;
2195 };
2196
2197 let matching: Vec<DefId> = candidates
2198 .iter()
2199 .copied()
2200 .filter(|&(f_idx, _)| {
2201 let file_path = &files[f_idx as usize].path;
2202 let last_segment = qual_segments.last().copied().unwrap_or("");
2203 let path_as_module =
2204 file_path.trim_end_matches(".rs").replace(['/', '\\'], "::");
2205 path_as_module.contains(last_segment)
2206 || file_path.contains(last_segment)
2207 })
2208 .collect();
2209
2210 if matching.len() == 1 {
2211 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(matching[0]);
2212 }
2213 // Ambiguous or no match → leave None.
2214 continue;
2215 }
2216
2217 // ── Priority 3.5: SQL suffix-match resolution ────────────
2218 //
2219 // dbt / sqlmesh pipelines use layered schema prefixes:
2220 // `silver_issuer_returns.sql` defines the silver-layer model.
2221 // `gold_issuer_returns.sql` references it via a FROM clause
2222 // that tree-sitter parses as `name: "issuer_returns"` (the
2223 // `name:` field-selector strips the `@{schema}.` prefix).
2224 // `def_index.get("issuer_returns")` returns None — the def is
2225 // stored under "silver_issuer_returns".
2226 //
2227 // When: (a) no exact-name candidate exists, AND (b) the
2228 // caller's enclosing def is a sql_file (whole-file synthetic
2229 // def emitted by `enrich_sql_file_def`), walk every sql_file
2230 // def in the graph and check whether its name ends with
2231 // `_<call_name>`. Unique suffix-match → resolve. Ambiguous
2232 // (e.g., both gold_ and silver_ match the same bare name) →
2233 // leave None (no silent first-wins).
2234 //
2235 // Non-sql_file callers are explicitly excluded: a Rust
2236 // function_item whose call_name happens to end with a suffix
2237 // of some sql_file def must NOT be resolved via this path.
2238 if !def_index.contains_key(&call_name)
2239 && files[file_idx].defs[def_idx].kind == "sql_file"
2240 && !call_name.is_empty()
2241 {
2242 let suffix = format!("_{call_name}");
2243 let suffix_str = suffix.as_str();
2244 // Exclude the caller def itself from the suffix scan:
2245 // `gold_issuer_returns` also ends with `_issuer_returns`
2246 // but must not self-resolve.
2247 #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
2248 let caller_did: DefId = (file_idx as u32, def_idx as u16);
2249 let suffix_matches: Vec<DefId> = files
2250 .iter()
2251 .enumerate()
2252 .flat_map(|(f_idx, f)| {
2253 f.defs.iter().enumerate().filter_map(move |(d_idx, d)| {
2254 #[expect(
2255 clippy::cast_possible_truncation,
2256 reason = "file and def indices fit in u32/u16"
2257 )]
2258 let did: DefId = (f_idx as u32, d_idx as u16);
2259 if d.kind == "sql_file"
2260 && d.name.ends_with(suffix_str)
2261 && did != caller_did
2262 {
2263 Some(did)
2264 } else {
2265 None
2266 }
2267 })
2268 })
2269 .collect();
2270 if suffix_matches.len() == 1 {
2271 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2272 Some(suffix_matches[0]);
2273 }
2274 // Ambiguous (>1) or no match (0) → leave None.
2275 continue;
2276 }
2277
2278 let Some(candidates) = def_index.get(&call_name) else {
2279 continue;
2280 };
2281
2282 // ── Priority 2: Receiver-type resolution ─────────────────
2283 //
2284 // `receiver_type = Some("Foo")` means this is a method call on a
2285 // value whose type is `Foo`. Filter candidates to those whose scope
2286 // chain contains the receiver type name.
2287 if let Some(ref rtype) = receiver_type {
2288 // Candidates whose scope contains the receiver type name.
2289 let receiver_matching: Vec<DefId> = candidates
2290 .iter()
2291 .copied()
2292 .filter(|&(f_idx, d_idx)| {
2293 let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2294 scope.contains(rtype.as_str())
2295 })
2296 .collect();
2297
2298 if receiver_matching.len() == 1 {
2299 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2300 Some(receiver_matching[0]);
2301 continue;
2302 }
2303
2304 if receiver_matching.len() > 1 {
2305 // Among receiver-matching candidates, prefer those in imported files.
2306 let imported_receiver_matching: Vec<DefId> = receiver_matching
2307 .iter()
2308 .copied()
2309 .filter(|(f, _)| imported_files[file_idx].contains(f))
2310 .collect();
2311 if imported_receiver_matching.len() == 1 {
2312 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2313 Some(imported_receiver_matching[0]);
2314 }
2315 // Ambiguous even after import filter → leave None.
2316 continue;
2317 }
2318
2319 // ── Priority 2.5: Python MRO walk ─────────────────────
2320 //
2321 // The receiver type's own scope has no matching def — but
2322 // the method may live on an ancestor class. Walk the MRO
2323 // (left-first DFS) and try the scope-match against each
2324 // ancestor's name. First ancestor with at least one
2325 // scope-matching candidate wins; if multiple candidates
2326 // match for the same ancestor, prefer imported files,
2327 // else take the first in stable order.
2328 //
2329 // Liskov: a subclass's `self.method()` call must dispatch
2330 // through the MRO; over-approximating ancestors is the
2331 // correct conservative move for a reverse call graph.
2332 // For non-Python languages or Python receivers with no
2333 // recorded parents, `compute_python_mro` returns an
2334 // empty vector and this loop is a no-op.
2335 let mro = compute_python_mro(rtype, python_class_hierarchy);
2336 let mut resolved_via_mro: Option<DefId> = None;
2337 for ancestor in &mro {
2338 let ancestor_matching: Vec<DefId> = candidates
2339 .iter()
2340 .copied()
2341 .filter(|&(f_idx, d_idx)| {
2342 let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2343 scope.contains(ancestor.as_str())
2344 })
2345 .collect();
2346 if ancestor_matching.len() == 1 {
2347 resolved_via_mro = Some(ancestor_matching[0]);
2348 break;
2349 }
2350 if ancestor_matching.len() > 1 {
2351 // Prefer imported files among the ancestor matches.
2352 let imported_ancestor: Vec<DefId> = ancestor_matching
2353 .iter()
2354 .copied()
2355 .filter(|(f, _)| imported_files[file_idx].contains(f))
2356 .collect();
2357 if imported_ancestor.len() == 1 {
2358 resolved_via_mro = Some(imported_ancestor[0]);
2359 break;
2360 }
2361 // Ambiguous at this ancestor — pick the first
2362 // candidate in stable order. The MRO walk's job
2363 // is to find *an* implementing def for an
2364 // inherited call, not to compute the runtime
2365 // dispatch winner; any plausible candidate is
2366 // useful for the reverse call graph.
2367 resolved_via_mro = Some(ancestor_matching[0]);
2368 break;
2369 }
2370 }
2371 if let Some(did) = resolved_via_mro {
2372 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2373 continue;
2374 }
2375 // No receiver-matching candidates anywhere in the MRO →
2376 // fall through to bare-name resolution.
2377 }
2378
2379 // ── Priority 3: Same-file resolution ─────────────────────
2380 #[expect(clippy::cast_possible_truncation)]
2381 let file_idx_u32 = file_idx as u32;
2382 if let Some(&did) = candidates.iter().find(|(f, _)| *f == file_idx_u32) {
2383 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2384 continue;
2385 }
2386
2387 // ── Priority 4: Imported-file resolution ──────────────────
2388 let imported_candidates: Vec<DefId> = candidates
2389 .iter()
2390 .copied()
2391 .filter(|(f, _)| imported_files[file_idx].contains(f))
2392 .collect();
2393 if imported_candidates.len() == 1 {
2394 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2395 Some(imported_candidates[0]);
2396 }
2397 // Priority 5: Ambiguous or unresolved → leave None.
2398 }
2399 }
2400 }
2401}
2402
2403/// Compute a prefix-sum offset table for flattening `DefId`s to linear indices.
2404fn def_offsets(files: &[FileNode]) -> Vec<usize> {
2405 let mut offsets = Vec::with_capacity(files.len() + 1);
2406 offsets.push(0);
2407 for file in files {
2408 offsets.push(offsets.last().unwrap() + file.defs.len());
2409 }
2410 offsets
2411}
2412
2413/// Flatten a `DefId` to a linear index using the offset table.
2414fn flatten_def_id(offsets: &[usize], did: DefId) -> usize {
2415 offsets[did.0 as usize] + did.1 as usize
2416}
2417
2418/// Build top-N caller and callee lists for each definition (flattened).
2419fn build_def_neighbor_lists(
2420 n: usize,
2421 edges: &[(u32, u32, u32)],
2422 offsets: &[usize],
2423) -> (Vec<Vec<DefId>>, Vec<Vec<DefId>>) {
2424 let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
2425 let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
2426
2427 for &(src, dst, w) in edges {
2428 let (s, d) = (src as usize, dst as usize);
2429 if s < n && d < n {
2430 incoming[d].push((src, w));
2431 outgoing[s].push((dst, w));
2432 }
2433 }
2434
2435 // Convert flat index back to DefId
2436 let to_def_id = |flat: u32| -> DefId {
2437 let flat_usize = flat as usize;
2438 let file_idx = offsets.partition_point(|&o| o <= flat_usize) - 1;
2439 let def_idx = flat_usize - offsets[file_idx];
2440 #[expect(clippy::cast_possible_truncation)]
2441 (file_idx as u32, def_idx as u16)
2442 };
2443
2444 let callers = incoming
2445 .into_iter()
2446 .map(|mut v| {
2447 v.sort_by_key(|b| std::cmp::Reverse(b.1));
2448 v.truncate(MAX_NEIGHBORS);
2449 v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
2450 })
2451 .collect();
2452
2453 let callees = outgoing
2454 .into_iter()
2455 .map(|mut v| {
2456 v.sort_by_key(|b| std::cmp::Reverse(b.1));
2457 v.truncate(MAX_NEIGHBORS);
2458 v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
2459 })
2460 .collect();
2461
2462 (callers, callees)
2463}
2464
2465// ── PageRank ─────────────────────────────────────────────────────────
2466
2467/// Compute `PageRank` scores for a graph.
2468///
2469/// If `focus` is `Some(idx)`, computes topic-sensitive `PageRank` biased
2470/// toward file `idx`. Otherwise computes standard (uniform) `PageRank`.
2471///
2472/// Returns one score per node, summing to 1.0.
2473#[expect(
2474 clippy::cast_precision_loss,
2475 reason = "node count fits comfortably in f32"
2476)]
2477fn pagerank(n: usize, edges: &[(u32, u32, u32)], focus: Option<usize>) -> Vec<f32> {
2478 if n == 0 {
2479 return vec![];
2480 }
2481
2482 // Build adjacency: out_edges[src] = [(dst, weight)]
2483 let mut out_edges: Vec<Vec<(usize, f32)>> = vec![vec![]; n];
2484 let mut out_weight: Vec<f32> = vec![0.0; n];
2485
2486 for &(src, dst, w) in edges {
2487 let (s, d) = (src as usize, dst as usize);
2488 if s < n && d < n {
2489 #[expect(clippy::cast_possible_truncation, reason = "edge weights are small")]
2490 let wf = f64::from(w) as f32;
2491 out_edges[s].push((d, wf));
2492 out_weight[s] += wf;
2493 }
2494 }
2495
2496 // Personalization vector (Haveliwala 2002, topic-sensitive PageRank).
2497 //
2498 // When a focus file is specified, the teleportation distribution is split:
2499 // - PERSONALIZATION_ALPHA (0.15) concentrated on the focus node.
2500 // - (1 - PERSONALIZATION_ALPHA) = 0.85 spread uniformly over the
2501 // remaining (n - 1) other nodes.
2502 //
2503 // This gives the focus file a gentle bias over its neighbors without
2504 // collapsing every other file to an equal uniform floor. The resulting
2505 // ranks still vary across the corpus, so the caller sees a *neighborhood*
2506 // of semantically related files rebiased toward the focus (I#16 fix).
2507 //
2508 // For n == 1 there are no other nodes; the focus gets all mass (= 1.0).
2509 let bias: Vec<f32> = if let Some(idx) = focus {
2510 if n == 1 {
2511 vec![1.0_f32]
2512 } else {
2513 let other_mass = (1.0_f32 - PERSONALIZATION_ALPHA) / (n as f32 - 1.0);
2514 let mut b = vec![other_mass; n];
2515 if idx < n {
2516 b[idx] = PERSONALIZATION_ALPHA;
2517 }
2518 // Verify sum ≈ 1.0 (should hold by construction; normalization
2519 // guards against floating-point drift on very large graphs).
2520 let sum: f32 = b.iter().sum();
2521 for v in &mut b {
2522 *v /= sum;
2523 }
2524 b
2525 }
2526 } else {
2527 vec![1.0 / n as f32; n]
2528 };
2529
2530 let mut rank = vec![1.0 / n as f32; n];
2531 let mut next_rank = vec![0.0_f32; n];
2532
2533 for _ in 0..MAX_ITERATIONS {
2534 // Collect dangling mass (nodes with no outgoing edges)
2535 let dangling: f32 = rank
2536 .iter()
2537 .enumerate()
2538 .filter(|&(i, _)| out_edges[i].is_empty())
2539 .map(|(_, &r)| r)
2540 .sum();
2541
2542 // Distribute rank
2543 for (i, nr) in next_rank.iter_mut().enumerate() {
2544 *nr = (1.0 - DAMPING).mul_add(bias[i], DAMPING * dangling * bias[i]);
2545 }
2546
2547 for (src, edges_list) in out_edges.iter().enumerate() {
2548 if edges_list.is_empty() {
2549 continue;
2550 }
2551 let src_rank = rank[src];
2552 let total_w = out_weight[src];
2553 for &(dst, w) in edges_list {
2554 next_rank[dst] += DAMPING * src_rank * (w / total_w);
2555 }
2556 }
2557
2558 // Check convergence
2559 let diff: f32 = rank
2560 .iter()
2561 .zip(next_rank.iter())
2562 .map(|(a, b)| (a - b).abs())
2563 .sum();
2564
2565 std::mem::swap(&mut rank, &mut next_rank);
2566
2567 if diff < EPSILON {
2568 break;
2569 }
2570 }
2571
2572 rank
2573}
2574
2575// ── Graph Building ───────────────────────────────────────────────────
2576
2577/// Intermediate result from definition-level graph computation.
2578struct DefGraphData {
2579 def_edges: Vec<(DefId, DefId, u32)>,
2580 def_ranks: Vec<f32>,
2581 def_callers: Vec<Vec<DefId>>,
2582 def_callees: Vec<Vec<DefId>>,
2583 offsets: Vec<usize>,
2584 base_ranks: Vec<f32>,
2585 file_edges: Vec<(u32, u32, u32)>,
2586}
2587
2588/// Build bidirectional trait↔impl method edges for PageRank propagation (G3).
2589///
2590/// For every impl method that overrides a trait method, adds:
2591/// - `(impl_def_id, trait_def_id, 1)` — impl → trait
2592/// - `(trait_def_id, impl_def_id, 1)` — trait → impl
2593///
2594/// Detection heuristic: an impl method "overrides" a trait method when:
2595/// - The impl method's kind is `"function_item"` and its `scope` starts with
2596/// `"impl_item"`.
2597/// - The trait method's kind is `"function_signature_item"` and its `scope`
2598/// starts with `"trait_item"`.
2599/// - Both have the same `name`.
2600/// - The impl's file imports the trait's file (or they share a file).
2601///
2602/// This is heuristic, not sound: it may produce false positives when two
2603/// unrelated traits define methods with the same name. The practical false-
2604/// positive rate on real Rust codebases is low because method names are
2605/// usually unique within a crate.
2606#[must_use]
2607pub fn build_trait_impl_edges_pub(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
2608 build_trait_impl_edges(files)
2609}
2610
2611fn build_trait_impl_edges(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
2612 // Build index: method_name → list of (DefId, is_trait_method).
2613 // trait method: kind == "function_signature_item" (abstract) OR scope contains "trait_item".
2614 // impl method: kind == "function_item" AND scope contains "impl_item".
2615 let mut trait_methods: HashMap<String, Vec<DefId>> = HashMap::new();
2616 let mut impl_methods: HashMap<String, Vec<DefId>> = HashMap::new();
2617
2618 for (fi, file) in files.iter().enumerate() {
2619 for (di, def) in file.defs.iter().enumerate() {
2620 #[expect(clippy::cast_possible_truncation)]
2621 let did: DefId = (fi as u32, di as u16);
2622 if def.kind == "function_signature_item"
2623 || (def.scope.starts_with("trait_item") && def.kind == "function_item")
2624 {
2625 trait_methods.entry(def.name.clone()).or_default().push(did);
2626 } else if def.kind == "function_item" && def.scope.starts_with("impl_item") {
2627 impl_methods.entry(def.name.clone()).or_default().push(did);
2628 }
2629 }
2630 }
2631
2632 // Pre-build imported-files sets to restrict matching.
2633 let imported_sets: Vec<std::collections::HashSet<u32>> = files
2634 .iter()
2635 .map(|f| {
2636 f.imports
2637 .iter()
2638 .filter_map(|imp| imp.resolved_idx)
2639 .collect()
2640 })
2641 .collect();
2642
2643 let mut edges: Vec<(DefId, DefId, u32)> = Vec::new();
2644
2645 for (name, trait_defs) in &trait_methods {
2646 let Some(impl_defs) = impl_methods.get(name) else {
2647 continue;
2648 };
2649 for &(tf, td) in trait_defs {
2650 for &(imf, imd) in impl_defs {
2651 // The impl file must import the trait file (or be the same file).
2652 let connected = tf == imf
2653 || imported_sets
2654 .get(imf as usize)
2655 .is_some_and(|s| s.contains(&tf));
2656 if connected {
2657 let trait_id: DefId = (tf, td);
2658 let impl_id: DefId = (imf, imd);
2659 edges.push((trait_id, impl_id, 1));
2660 edges.push((impl_id, trait_id, 1));
2661 }
2662 }
2663 }
2664 }
2665
2666 edges
2667}
2668
2669/// Build definition-level edges, compute `PageRank`, and derive file-level data.
2670fn compute_def_graph(files: &[FileNode]) -> DefGraphData {
2671 // Build definition-level edge list from resolved calls
2672 let mut def_edge_map: HashMap<(DefId, DefId), u32> = HashMap::new();
2673 for (file_idx, file) in files.iter().enumerate() {
2674 for (def_idx, def) in file.defs.iter().enumerate() {
2675 #[expect(clippy::cast_possible_truncation)]
2676 let caller_id: DefId = (file_idx as u32, def_idx as u16);
2677 for call in &def.calls {
2678 if let Some(callee_id) = call.resolved {
2679 *def_edge_map.entry((caller_id, callee_id)).or_insert(0) += 1;
2680 }
2681 }
2682 }
2683 }
2684
2685 // Add trait↔impl bidirectional edges (G3).
2686 let trait_impl_edges = build_trait_impl_edges(files);
2687 for (src, dst, w) in trait_impl_edges {
2688 *def_edge_map.entry((src, dst)).or_insert(0) += w;
2689 }
2690
2691 let def_edges: Vec<(DefId, DefId, u32)> = def_edge_map
2692 .into_iter()
2693 .map(|((src, dst), w)| (src, dst, w))
2694 .collect();
2695
2696 // Compute def-level PageRank
2697 let offsets = def_offsets(files);
2698 let n_defs = *offsets.last().unwrap_or(&0);
2699
2700 let flat_def_edges: Vec<(u32, u32, u32)> = def_edges
2701 .iter()
2702 .map(|(src, dst, w)| {
2703 #[expect(clippy::cast_possible_truncation)]
2704 (
2705 flatten_def_id(&offsets, *src) as u32,
2706 flatten_def_id(&offsets, *dst) as u32,
2707 *w,
2708 )
2709 })
2710 .collect();
2711
2712 let def_ranks = pagerank(n_defs, &flat_def_edges, None);
2713
2714 // Aggregate def ranks to file level
2715 let base_ranks: Vec<f32> = files
2716 .iter()
2717 .enumerate()
2718 .map(|(i, file)| {
2719 let start = offsets[i];
2720 let end = start + file.defs.len();
2721 def_ranks[start..end].iter().sum()
2722 })
2723 .collect();
2724
2725 // Derive file-level edges from def-level call edges
2726 let mut file_edge_map: HashMap<(u32, u32), u32> = HashMap::new();
2727 for &(src, dst, w) in &def_edges {
2728 let src_file = src.0;
2729 let dst_file = dst.0;
2730 if src_file != dst_file {
2731 *file_edge_map.entry((src_file, dst_file)).or_insert(0) += w;
2732 }
2733 }
2734 let file_edges: Vec<(u32, u32, u32)> = file_edge_map
2735 .into_iter()
2736 .map(|((src, dst), w)| (src, dst, w))
2737 .collect();
2738
2739 // Build def-level caller/callee lists
2740 let (def_callers, def_callees) = build_def_neighbor_lists(n_defs, &flat_def_edges, &offsets);
2741
2742 DefGraphData {
2743 def_edges,
2744 def_ranks,
2745 def_callers,
2746 def_callees,
2747 offsets,
2748 base_ranks,
2749 file_edges,
2750 }
2751}
2752
2753/// Build a dependency graph from a repository root.
2754///
2755/// Walks the directory tree, parses each supported file with tree-sitter,
2756/// extracts definitions and imports, resolves import paths to files, runs
2757/// `PageRank`, and builds caller/callee lists.
2758///
2759/// # Errors
2760///
2761/// Returns an error if file walking or reading fails.
2762#[expect(
2763 clippy::too_many_lines,
2764 reason = "three-phase parallel pipeline (walk+filter, def+import extraction, call extraction) \
2765 plus resolve + graph build; phases share state (file_index, raw_sources) and \
2766 cannot be meaningfully split without passing large mutable structures across \
2767 boundaries with no clarity gain"
2768)]
2769pub fn build_graph(root: &Path) -> crate::Result<RepoGraph> {
2770 let root = root.canonicalize().map_err(|e| crate::Error::Io {
2771 path: root.display().to_string(),
2772 source: e,
2773 })?;
2774
2775 let mut walk_options = walk::WalkOptions::default();
2776 if let Some((_, config)) = crate::cache::config::find_config(&root) {
2777 walk_options.ignore_patterns = config.ignore.patterns;
2778 }
2779 let all_files = walk::collect_files_with_options(&root, &walk_options);
2780
2781 // Phase 1: parallel filter + read. For each candidate path with a
2782 // supported extension, read its source from disk and emit a tuple
2783 // alongside its relative path. rayon spreads the I/O cost across
2784 // worker threads; on a 1M-file corpus this was ~20s sequential and
2785 // now sits in the 2-3s range bounded by disk + filter throughput.
2786 let raw_inputs: Vec<(PathBuf, String, String, String)> = all_files
2787 .par_iter()
2788 .filter_map(|path| {
2789 let ext = path
2790 .extension()
2791 .and_then(|e| e.to_str())
2792 .unwrap_or_default()
2793 .to_string();
2794 if languages::config_for_extension(&ext).is_none()
2795 && import_query_for_extension(&ext).is_none()
2796 {
2797 return None;
2798 }
2799 let source = std::fs::read_to_string(path).ok()?;
2800 let rel_path = path
2801 .strip_prefix(&root)
2802 .unwrap_or(path)
2803 .display()
2804 .to_string();
2805 Some((path.clone(), rel_path, ext, source))
2806 })
2807 .collect();
2808
2809 // Build the contiguous `files` Vec and the absolute-path -> idx
2810 // lookup. Sequential because both want stable indices that match
2811 // `raw_sources`'s order; the per-file work this gates is trivial.
2812 let mut file_index: HashMap<PathBuf, usize> = HashMap::with_capacity(raw_inputs.len());
2813 let mut files: Vec<FileNode> = Vec::with_capacity(raw_inputs.len());
2814 let mut raw_sources: Vec<(usize, String, String)> = Vec::with_capacity(raw_inputs.len());
2815 for (idx, (abs_path, rel_path, ext, source)) in raw_inputs.into_iter().enumerate() {
2816 file_index.insert(abs_path, idx);
2817 files.push(FileNode {
2818 path: rel_path,
2819 defs: vec![],
2820 imports: vec![],
2821 });
2822 raw_sources.push((idx, ext, source));
2823 }
2824
2825 // Phase 2: parallel per-file definition + import extraction. Each
2826 // file's tree-sitter parse + def/import queries are independent;
2827 // par_iter_mut over files.iter_mut().zip(raw_sources.par_iter())
2828 // lets every rayon worker grind its own slice. The closures here
2829 // borrow `&root` and `&file_index` immutably (both Sync) and write
2830 // disjoint `FileNode` slots via the &mut iterator.
2831 files
2832 .par_iter_mut()
2833 .zip(raw_sources.par_iter())
2834 .for_each(|(file, (_, ext, source))| {
2835 if let Some(config) = languages::config_for_extension(ext) {
2836 file.defs = extract_definitions(source, &config);
2837 // Go method_declaration scopes are empty after the generic
2838 // extract_definitions pass (the method is a top-level node
2839 // with no structural parent in CONTAINER_KINDS). Enrich them
2840 // with the receiver type so that resolve_calls Priority 2
2841 // (scope.contains(recv_type)) fires correctly for cross-file
2842 // Go receiver-method calls. This populates def_callers[] for
2843 // Go in compute_def_graph (P1 fix).
2844 if languages::is_go_language(&config.language) {
2845 enrich_go_method_def_scopes(source, &mut file.defs);
2846 }
2847 // SQL: prepend a synthetic file-level def named after the
2848 // filename stem (dbt/sqlmesh convention). The whole-file
2849 // byte range becomes the smallest-enclosing fallback for
2850 // FROM/JOIN call-edges that are not inside any CTE, which
2851 // is the resolution target for cross-model references
2852 // (S1, Wave 4). file.path is relative to the repo root and
2853 // is what file_stem() needs to derive the model name.
2854 if languages::is_sql_language(&config.language) {
2855 enrich_sql_file_def(&file.path, source, &mut file.defs);
2856 }
2857 }
2858 if let Some((lang, import_query)) = import_query_for_extension(ext) {
2859 let raw_imports = extract_imports(source, &lang, &import_query);
2860 let file_path = root.join(&file.path);
2861 file.imports = raw_imports
2862 .into_iter()
2863 .map(|raw| {
2864 let resolved_idx =
2865 resolve_import(&raw, ext, &file_path, &root, &file_index)
2866 .and_then(|i| u32::try_from(i).ok());
2867 ImportRef {
2868 raw_path: raw,
2869 resolved_idx,
2870 }
2871 })
2872 .collect();
2873 }
2874 });
2875
2876 // Phase 3: parallel per-file call extraction. Mutates each
2877 // FileNode's `defs[*].calls` independently. Aligned with
2878 // raw_sources by index via the zip.
2879 files
2880 .par_iter_mut()
2881 .zip(raw_sources.par_iter())
2882 .for_each(|(file, (_, ext, source))| {
2883 if let Some(call_config) = languages::call_query_for_extension(ext) {
2884 extract_calls(source, &call_config, &mut file.defs);
2885 }
2886 });
2887
2888 // Build the Python class hierarchy (class_name → parent class names) by
2889 // walking every Python source file. The map is used by `resolve_calls`
2890 // Priority 2.5 to dispatch `self.method()` calls through the MRO when
2891 // the method lives on a parent / mixin class (Q1, Wave 2).
2892 //
2893 // Parallel: extract_python_class_hierarchy is pure per-file, then we
2894 // fold the per-file maps into one global map sequentially because
2895 // HashMap is not lock-free. On a 1k-Python-file corpus this fold takes
2896 // <10ms — much smaller than the parallel parse work that feeds it.
2897 let python_hierarchies: Vec<HashMap<String, Vec<String>>> = raw_sources
2898 .par_iter()
2899 .map(|(_, ext, source)| {
2900 if ext == "py" || ext == "pyi" {
2901 extract_python_class_hierarchy(source)
2902 } else {
2903 HashMap::new()
2904 }
2905 })
2906 .collect();
2907 let mut python_class_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
2908 for local in python_hierarchies {
2909 for (k, v) in local {
2910 // First declaration wins on name collisions across files. The
2911 // MRO walk only needs a plausible parent chain to find an
2912 // ancestor's methods; this is conservative but acceptable.
2913 python_class_hierarchy.entry(k).or_insert(v);
2914 }
2915 }
2916
2917 // Resolve call references to target definitions
2918 let def_index = build_def_index(&files);
2919 resolve_calls(&mut files, &def_index, &python_class_hierarchy);
2920
2921 // Build def-level graph, compute PageRank, and derive file-level data
2922 let graph_data = compute_def_graph(&files);
2923
2924 // Build file-level caller/callee lists
2925 let n = files.len();
2926 let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
2927
2928 // Auto-tune alpha based on graph density
2929 #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
2930 let density = if n > 1 {
2931 graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
2932 } else {
2933 0.0
2934 };
2935 let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
2936
2937 Ok(RepoGraph {
2938 files,
2939 edges: graph_data.file_edges,
2940 base_ranks: graph_data.base_ranks,
2941 callers,
2942 callees,
2943 def_edges: graph_data.def_edges,
2944 def_ranks: graph_data.def_ranks,
2945 def_callers: graph_data.def_callers,
2946 def_callees: graph_data.def_callees,
2947 def_offsets: graph_data.offsets,
2948 alpha,
2949 })
2950}
2951
2952/// Build a `RepoGraph` directly from a pre-constructed `Vec<FileNode>`.
2953///
2954/// Skips the filesystem walk phase of [`build_graph`]; useful for integration
2955/// tests that want to build synthetic graphs without touching disk.
2956///
2957/// Resolves calls, builds the def-level graph (including G3 trait↔impl edges),
2958/// computes `PageRank`, and builds caller/callee lists.
2959#[must_use]
2960pub fn build_graph_from_files_pub(mut files: Vec<FileNode>) -> RepoGraph {
2961 let def_index = build_def_index(&files);
2962 // No source is available at this entry point, so the Python class
2963 // hierarchy is empty and the MRO walk (Priority 2.5) is a no-op.
2964 // Tests that need MRO resolution should drive `resolve_calls_with_python_mro_pub`
2965 // directly and then call `build_graph_from_files_pub` for the rest of the pipeline.
2966 let empty_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
2967 resolve_calls(&mut files, &def_index, &empty_hierarchy);
2968 let graph_data = compute_def_graph(&files);
2969 let n = files.len();
2970 let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
2971
2972 #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
2973 let density = if n > 1 {
2974 graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
2975 } else {
2976 0.0
2977 };
2978 let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
2979
2980 RepoGraph {
2981 files,
2982 edges: graph_data.file_edges,
2983 base_ranks: graph_data.base_ranks,
2984 callers,
2985 callees,
2986 def_edges: graph_data.def_edges,
2987 def_ranks: graph_data.def_ranks,
2988 def_callers: graph_data.def_callers,
2989 def_callees: graph_data.def_callees,
2990 def_offsets: graph_data.offsets,
2991 alpha,
2992 }
2993}
2994
2995// ── Dead-code analysis ───────────────────────────────────────────────────────
2996
2997/// Global flat definition index: `def_offsets[file_idx] + def_idx_within_file`.
2998///
2999/// This is the natural index into [`RepoGraph::def_ranks`],
3000/// [`RepoGraph::def_callers`], and [`RepoGraph::def_callees`].
3001pub type DefIndex = usize;
3002
3003/// A connected component in the dead-code subgraph.
3004///
3005/// All members are definitions that are unreachable from any entry point.
3006/// The component is formed by treating `def_callees + def_callers` edges as
3007/// undirected — so a transitively-dead group (including mutual-recursion
3008/// cycles that are collectively unreachable) surfaces as one cluster.
3009#[derive(Debug, Clone)]
3010pub struct DeadCluster {
3011 /// Global flat def index of the cluster root (the member with the
3012 /// highest [`RepoGraph::def_ranks`] score).
3013 pub root_def_idx: usize,
3014 /// Number of definitions in this cluster.
3015 pub size: usize,
3016 /// Sum of `end_line - start_line` for every member definition.
3017 pub total_lines: usize,
3018 /// All member global def indices, root first.
3019 pub member_def_indices: Vec<usize>,
3020}
3021
3022/// Summary report from [`compute_dead_code`].
3023///
3024/// The primary consumer is X3's `mcp__ripvec__find_dead_code` MCP tool.
3025#[derive(Debug, Clone)]
3026pub struct DeadCodeReport {
3027 /// Dead clusters sorted by size descending (largest first).
3028 pub dead_clusters: Vec<DeadCluster>,
3029 /// Total number of definitions unreachable from any entry point.
3030 pub total_dead_defs: usize,
3031 /// Total number of definitions reachable from at least one entry point.
3032 pub total_live_defs: usize,
3033 /// Fraction of all definitions that are dead: `dead / (dead + live)`.
3034 pub dead_fraction: f32,
3035}
3036
3037/// Returns true if the given path is a test-related path.
3038///
3039/// Heuristic: the path contains `tests/`, `/spec/`, `/specs/`, or the
3040/// file stem starts/ends with `test_`/`_test` or contains `bench`.
3041fn is_test_path(path: &str) -> bool {
3042 let path_lc = path.to_lowercase();
3043 if path_lc.contains("tests/") || path_lc.contains("/spec/") || path_lc.contains("/specs/") {
3044 return true;
3045 }
3046 let file_name = path.rsplit('/').next().unwrap_or(path);
3047 let stem = file_name.split('.').next().unwrap_or(file_name);
3048 stem.starts_with("test_") || stem.ends_with("_test") || stem.contains("bench")
3049}
3050
3051/// Resolve a flat [`DefIndex`] to its file index using the `def_offsets`
3052/// prefix-sum table.
3053fn flat_to_file_idx(offsets: &[usize], flat: DefIndex) -> usize {
3054 offsets.partition_point(|&o| o <= flat).saturating_sub(1)
3055}
3056
3057/// Union-find: path-compressing find. Returns the root representative of `x`.
3058fn uf_find(parent: &mut Vec<usize>, x: usize) -> usize {
3059 if parent[x] != x {
3060 parent[x] = uf_find(parent, parent[x]);
3061 }
3062 parent[x]
3063}
3064
3065/// Union-find: merge the components containing `x` and `y`.
3066fn uf_union(parent: &mut Vec<usize>, x: usize, y: usize) {
3067 let rx = uf_find(parent, x);
3068 let ry = uf_find(parent, y);
3069 if rx != ry {
3070 parent[rx] = ry;
3071 }
3072}
3073
3074/// Compute the set of definitions unreachable from any entry point.
3075///
3076/// Returns dead clusters (connected components in the unreachable subgraph),
3077/// sorted by size descending.
3078///
3079/// # Algorithm
3080///
3081/// 1. Optionally filter test paths from `entry_def_indices` (when
3082/// `include_test_paths` is `false`). Test-path heuristic: the file path
3083/// contains `test_`, `_test`, `tests/`, `spec/`, `specs/`, or `bench`.
3084/// 2. BFS forward over [`RepoGraph::def_callees`] from the entry seeds ->
3085/// reachable set.
3086/// 3. Complement `(all_defs - reachable)` = dead set.
3087/// 4. Connected-components on the dead subgraph via
3088/// `def_callees + def_callers` treated as undirected (union-find).
3089/// 5. For each cluster: pick the highest-rank def as the cluster root;
3090/// `size` = member count; `total_lines` = sum of `(end_line -
3091/// start_line)` per member.
3092/// 6. Sort clusters by size descending.
3093#[must_use]
3094#[expect(
3095 clippy::too_many_lines,
3096 reason = "six-step BFS+clustering pipeline; splitting into sub-functions \
3097 would require passing many interdependent Vec borrows with no \
3098 clarity gain"
3099)]
3100pub fn compute_dead_code<S: std::hash::BuildHasher>(
3101 graph: &RepoGraph,
3102 entry_def_indices: &HashSet<DefIndex, S>,
3103 include_test_paths: bool,
3104) -> DeadCodeReport {
3105 let n_defs = graph.def_ranks.len();
3106 if n_defs == 0 {
3107 return DeadCodeReport {
3108 dead_clusters: vec![],
3109 total_dead_defs: 0,
3110 total_live_defs: 0,
3111 dead_fraction: 0.0,
3112 };
3113 }
3114
3115 // Step 1: build the effective seed set.
3116 let seeds: Vec<DefIndex> = if include_test_paths {
3117 entry_def_indices.iter().copied().collect()
3118 } else {
3119 entry_def_indices
3120 .iter()
3121 .copied()
3122 .filter(|&flat| {
3123 let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
3124 let path = graph
3125 .files
3126 .get(file_idx)
3127 .map(|f| f.path.as_str())
3128 .unwrap_or("");
3129 !is_test_path(path)
3130 })
3131 .collect()
3132 };
3133
3134 // Step 2: BFS forward over def_callees from seeds -> reachable set.
3135 let mut reachable: Vec<bool> = vec![false; n_defs];
3136 let mut queue: std::collections::VecDeque<DefIndex> = std::collections::VecDeque::new();
3137
3138 for seed in &seeds {
3139 if *seed < n_defs && !reachable[*seed] {
3140 reachable[*seed] = true;
3141 queue.push_back(*seed);
3142 }
3143 }
3144
3145 while let Some(flat) = queue.pop_front() {
3146 for callee_did in &graph.def_callees[flat] {
3147 let callee_flat = graph.def_offsets[callee_did.0 as usize] + callee_did.1 as usize;
3148 if callee_flat < n_defs && !reachable[callee_flat] {
3149 reachable[callee_flat] = true;
3150 queue.push_back(callee_flat);
3151 }
3152 }
3153 }
3154
3155 // Step 3: dead set = all_defs - reachable.
3156 let dead: Vec<DefIndex> = (0..n_defs).filter(|&i| !reachable[i]).collect();
3157 let total_dead_defs = dead.len();
3158 let total_live_defs = n_defs - total_dead_defs;
3159
3160 if dead.is_empty() {
3161 return DeadCodeReport {
3162 dead_clusters: vec![],
3163 total_dead_defs: 0,
3164 total_live_defs,
3165 dead_fraction: 0.0,
3166 };
3167 }
3168
3169 // Step 4: connected components on the dead subgraph via union-find.
3170 let dead_set: HashSet<DefIndex> = dead.iter().copied().collect();
3171 let dead_pos: HashMap<DefIndex, usize> = dead
3172 .iter()
3173 .copied()
3174 .enumerate()
3175 .map(|(pos, idx)| (idx, pos))
3176 .collect();
3177 let m = dead.len();
3178 let mut parent: Vec<usize> = (0..m).collect();
3179
3180 for &flat in &dead {
3181 let pos_flat = dead_pos[&flat];
3182 for callee_did in &graph.def_callees[flat] {
3183 let callee_flat = graph.def_offsets[callee_did.0 as usize] + callee_did.1 as usize;
3184 if dead_set.contains(&callee_flat) {
3185 let pos_callee = dead_pos[&callee_flat];
3186 uf_union(&mut parent, pos_flat, pos_callee);
3187 }
3188 }
3189 for caller_did in &graph.def_callers[flat] {
3190 let caller_flat = graph.def_offsets[caller_did.0 as usize] + caller_did.1 as usize;
3191 if dead_set.contains(&caller_flat) {
3192 let pos_caller = dead_pos[&caller_flat];
3193 uf_union(&mut parent, pos_flat, pos_caller);
3194 }
3195 }
3196 }
3197
3198 // Flatten roots (path compression for all).
3199 for i in 0..m {
3200 uf_find(&mut parent, i);
3201 }
3202
3203 // Group members by their component root.
3204 let mut components: HashMap<usize, Vec<DefIndex>> = HashMap::new();
3205 for (pos, &flat) in dead.iter().enumerate() {
3206 let root_pos = parent[pos];
3207 components.entry(root_pos).or_default().push(flat);
3208 }
3209
3210 // Step 5: build clusters - root is the highest-rank member.
3211 let mut clusters: Vec<DeadCluster> = components
3212 .into_values()
3213 .map(|members| {
3214 let root_flat = members
3215 .iter()
3216 .copied()
3217 .max_by(|&a, &b| {
3218 let ra = graph.def_ranks.get(a).copied().unwrap_or(0.0);
3219 let rb = graph.def_ranks.get(b).copied().unwrap_or(0.0);
3220 ra.total_cmp(&rb)
3221 })
3222 .unwrap_or(members[0]);
3223
3224 let total_lines: usize = members
3225 .iter()
3226 .copied()
3227 .map(|flat| {
3228 let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
3229 let def_idx = flat - graph.def_offsets[file_idx];
3230 let def = graph.files.get(file_idx).and_then(|f| f.defs.get(def_idx));
3231 def.map(|d| (d.end_line as usize).saturating_sub(d.start_line as usize))
3232 .unwrap_or(0)
3233 })
3234 .sum();
3235
3236 let mut member_def_indices: Vec<usize> = std::iter::once(root_flat)
3237 .chain(members.iter().copied().filter(|&m| m != root_flat))
3238 .collect();
3239 if member_def_indices.len() > 1 {
3240 member_def_indices[1..].sort_unstable();
3241 }
3242
3243 DeadCluster {
3244 root_def_idx: root_flat,
3245 size: member_def_indices.len(),
3246 total_lines,
3247 member_def_indices,
3248 }
3249 })
3250 .collect();
3251
3252 // Step 6: sort by size descending.
3253 clusters.sort_by(|a, b| {
3254 b.size
3255 .cmp(&a.size)
3256 .then(b.root_def_idx.cmp(&a.root_def_idx))
3257 });
3258
3259 #[expect(
3260 clippy::cast_precision_loss,
3261 reason = "def counts fit comfortably in f32"
3262 )]
3263 let dead_fraction = if n_defs > 0 {
3264 total_dead_defs as f32 / n_defs as f32
3265 } else {
3266 0.0
3267 };
3268
3269 DeadCodeReport {
3270 dead_clusters: clusters,
3271 total_dead_defs,
3272 total_live_defs,
3273 dead_fraction,
3274 }
3275}
3276
3277impl RepoGraph {
3278 /// Get the `PageRank` score for a specific definition.
3279 #[must_use]
3280 pub fn def_rank(&self, did: DefId) -> f32 {
3281 let flat = self.def_offsets[did.0 as usize] + did.1 as usize;
3282 self.def_ranks.get(flat).copied().unwrap_or(0.0)
3283 }
3284
3285 /// Look up a definition by file path and name. Returns the first match.
3286 #[must_use]
3287 pub fn find_def(&self, file_path: &str, def_name: &str) -> Option<DefId> {
3288 for (file_idx, file) in self.files.iter().enumerate() {
3289 if file.path == file_path {
3290 for (def_idx, def) in file.defs.iter().enumerate() {
3291 if def.name == def_name {
3292 #[expect(clippy::cast_possible_truncation)]
3293 return Some((file_idx as u32, def_idx as u16));
3294 }
3295 }
3296 }
3297 }
3298 None
3299 }
3300
3301 /// Resolve a caller-supplied `focus_file` string to a file index in [`Self::files`].
3302 ///
3303 /// Accepts any of the path forms that ripvec itself emits or accepts:
3304 ///
3305 /// - **Exact stored path** (`device_opt/services/storage.py`) — direct match.
3306 /// - **LSP-shaped path** (`./device_opt/services/storage.py`) — the `./`
3307 /// prefix used by every [`RepoMapLspLocation::file_path`] is stripped
3308 /// before comparison so the documented chaining pattern
3309 /// `get_repo_map(focus_file=hits[0].lsp_location.file_path)` works.
3310 /// - **Strict suffix** (`storage.py`, `services/storage.py`) — match when
3311 /// the previous character in the stored path is `/`. Avoids matching
3312 /// `foo_storage.py` for `storage.py`.
3313 ///
3314 /// Returns [`FocusResolution::Found`] when exactly one file matches,
3315 /// [`FocusResolution::Ambiguous`] when multiple files match (the caller
3316 /// surfaces the candidate list to the user), and [`FocusResolution::NotFound`]
3317 /// when no file matches.
3318 ///
3319 /// # Background
3320 ///
3321 /// Prior to this helper the MCP layer (`crates/ripvec-mcp/src/tools.rs`)
3322 /// did the matching inline with two bugs:
3323 ///
3324 /// 1. **`./` prefix mismatch.** [`RepoMapLspLocation::file_path`] always
3325 /// carries a leading `./` (see [`file_lsp_location`]), but
3326 /// [`FileNode::path`] does not. Passing the LSP location verbatim as
3327 /// `focus_file` matched zero files. The matcher silently returned
3328 /// `focus = None`, producing rank values bit-identical to the unfocused
3329 /// call — the bug originally reported as "I#20 focus_file rebias
3330 /// invisible on Python".
3331 /// 2. **Equal-length false negative.** When the user passed
3332 /// `./device_opt/services/storage.py` and the stored path was
3333 /// `device_opt/services/storage.py`, `exact` was false (the strings
3334 /// differ by two bytes) and `strict_suffix` was false (the focus is
3335 /// longer than the stored path, so `p.len() > focus.len()` fails). The
3336 /// pathology surfaced specifically when the focus was a *full* path
3337 /// with the LSP `./` prefix.
3338 ///
3339 /// Centralising the resolution here gives every caller the same
3340 /// normalization-tolerant semantics and one place to test the contract.
3341 #[must_use]
3342 pub fn resolve_focus_file(&self, focus: &str) -> FocusResolution {
3343 let normalized = normalize_focus_path(focus);
3344 let matches: Vec<usize> = self
3345 .files
3346 .iter()
3347 .enumerate()
3348 .filter_map(|(idx, f)| {
3349 if focus_matches_path(&f.path, normalized) {
3350 Some(idx)
3351 } else {
3352 None
3353 }
3354 })
3355 .collect();
3356 match matches.len() {
3357 0 => FocusResolution::NotFound,
3358 1 => FocusResolution::Found(matches[0]),
3359 _ => FocusResolution::Ambiguous(
3360 matches
3361 .into_iter()
3362 .map(|i| self.files[i].path.clone())
3363 .collect(),
3364 ),
3365 }
3366 }
3367}
3368
3369/// Result of resolving a user-supplied `focus_file` string against a [`RepoGraph`].
3370///
3371/// See [`RepoGraph::resolve_focus_file`] for the resolution semantics and the
3372/// historical bug that motivated the helper.
3373#[derive(Debug, Clone)]
3374pub enum FocusResolution {
3375 /// Exactly one file matched. Carries the file index in [`RepoGraph::files`].
3376 Found(usize),
3377 /// No file matched. The caller treats this as an unfocused call.
3378 NotFound,
3379 /// Two or more files matched. The caller surfaces the candidate list so
3380 /// the user can disambiguate by passing a longer suffix or the full path.
3381 Ambiguous(Vec<String>),
3382}
3383
3384/// Strip the leading `./` prefix from a focus_file path.
3385///
3386/// The `./` form is produced by [`file_lsp_location`] for every
3387/// [`RepoMapLspLocation::file_path`] field on a relative path. Stripping it
3388/// gives a stored-path-shaped value for the suffix matcher to compare
3389/// against [`FileNode::path`] entries (which do not carry the prefix).
3390///
3391/// Absolute paths (`/abs/path/file.py`) are returned unchanged; they will
3392/// fail the suffix match against the relative stored paths, which is the
3393/// correct behavior (the caller meant a different root entirely).
3394fn normalize_focus_path(focus: &str) -> &str {
3395 focus.strip_prefix("./").unwrap_or(focus)
3396}
3397
3398/// Return true when `focus` matches `stored_path` as either an exact path or
3399/// a strict-suffix (must be preceded by `/`). The empty focus does not match.
3400fn focus_matches_path(stored_path: &str, focus: &str) -> bool {
3401 if focus.is_empty() {
3402 return false;
3403 }
3404 if stored_path == focus {
3405 return true;
3406 }
3407 stored_path.len() > focus.len()
3408 && stored_path.ends_with(focus)
3409 && stored_path.as_bytes()[stored_path.len() - focus.len() - 1] == b'/'
3410}
3411
3412/// Build top-N caller and callee lists for each file.
3413///
3414/// Given a list of weighted directed edges `(src, dst, weight)` over `n`
3415/// nodes, returns `(callers[i], callees[i])` for each node `i`, where each
3416/// list contains the top-[`MAX_NEIGHBORS`] adjacent nodes sorted by descending
3417/// edge weight.
3418///
3419/// Exposed as `pub` so that integration tests can construct synthetic
3420/// [`RepoGraph`] instances for unit-testing the JSON rendering without going
3421/// through a full disk walk.
3422#[must_use]
3423pub fn build_neighbor_lists(n: usize, edges: &[(u32, u32, u32)]) -> (Vec<Vec<u32>>, Vec<Vec<u32>>) {
3424 let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3425 let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3426
3427 for &(src, dst, w) in edges {
3428 let (s, d) = (src as usize, dst as usize);
3429 if s < n && d < n {
3430 incoming[d].push((src, w));
3431 outgoing[s].push((dst, w));
3432 }
3433 }
3434
3435 // Sort by weight descending, keep top N
3436 let trim = |lists: &mut [Vec<(u32, u32)>]| -> Vec<Vec<u32>> {
3437 lists
3438 .iter_mut()
3439 .map(|list| {
3440 list.sort_by_key(|b| std::cmp::Reverse(b.1));
3441 list.iter()
3442 .take(MAX_NEIGHBORS)
3443 .map(|(idx, _)| *idx)
3444 .collect()
3445 })
3446 .collect()
3447 };
3448
3449 (trim(&mut incoming), trim(&mut outgoing))
3450}
3451
3452// ── Rendering ────────────────────────────────────────────────────────
3453
3454/// Render a budget-constrained overview of the repository.
3455///
3456/// Files are sorted by `PageRank` (or topic-sensitive rank if `focus` is
3457/// `Some`). Output uses four tiers of decreasing detail:
3458///
3459/// - **Tier 0** (top 10%): full path, rank, callers/callees, signatures with scopes
3460/// - **Tier 1** (next 20%): full path, rank, signatures
3461/// - **Tier 2** (next 40%): full path, rank, definition names and kinds
3462/// - **Tier 3** (bottom 30%): file path only
3463///
3464/// Stops accumulating output when the estimated token count exceeds
3465/// `max_tokens`.
3466#[must_use]
3467pub fn render(graph: &RepoGraph, max_tokens: usize, focus: Option<usize>) -> String {
3468 let n = graph.files.len();
3469 if n == 0 {
3470 return String::new();
3471 }
3472
3473 // Compute ranks (recompute topic-sensitive if focus is given)
3474 let ranks = if focus.is_some() {
3475 pagerank(n, &graph.edges, focus)
3476 } else {
3477 graph.base_ranks.clone()
3478 };
3479
3480 // Sort file indices by rank descending
3481 let mut sorted: Vec<usize> = (0..n).collect();
3482 sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
3483
3484 let mut output = String::new();
3485 let mut used_tokens = 0;
3486 let max_chars = max_tokens * CHARS_PER_TOKEN;
3487
3488 for (rank_pos, &file_idx) in sorted.iter().enumerate() {
3489 if used_tokens >= max_tokens {
3490 break;
3491 }
3492
3493 let file = &graph.files[file_idx];
3494 let score = ranks[file_idx];
3495 #[expect(clippy::cast_precision_loss, reason = "file counts fit in f32")]
3496 let percentile = (rank_pos as f32) / (n as f32);
3497
3498 let section = if percentile < 0.1 {
3499 render_tier0(graph, file_idx, file, score)
3500 } else if percentile < 0.3 {
3501 render_tier1(file, score)
3502 } else if percentile < 0.7 {
3503 render_tier2(file, score)
3504 } else {
3505 render_tier3(file)
3506 };
3507
3508 let section_chars = section.len();
3509 if used_tokens > 0 && used_tokens + section_chars / CHARS_PER_TOKEN > max_tokens {
3510 // Would exceed budget — try to fit at least the path
3511 let path_line = format!("{}\n", file.path);
3512 let path_tokens = path_line.len() / CHARS_PER_TOKEN;
3513 if used_tokens + path_tokens <= max_tokens {
3514 output.push_str(&path_line);
3515 }
3516 break;
3517 }
3518
3519 output.push_str(§ion);
3520 used_tokens = output.len().min(max_chars) / CHARS_PER_TOKEN;
3521 }
3522
3523 output
3524}
3525
3526/// Render tier 0: full detail with callers, callees, and signatures.
3527fn render_tier0(graph: &RepoGraph, file_idx: usize, file: &FileNode, score: f32) -> String {
3528 let mut out = format!("## {} (rank: {score:.4})\n", file.path);
3529
3530 // Callers
3531 if file_idx < graph.callers.len() && !graph.callers[file_idx].is_empty() {
3532 let _ = write!(out, " called by: ");
3533 let names: Vec<&str> = graph.callers[file_idx]
3534 .iter()
3535 .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
3536 .collect();
3537 let _ = writeln!(out, "{}", names.join(", "));
3538 }
3539
3540 // Callees
3541 if file_idx < graph.callees.len() && !graph.callees[file_idx].is_empty() {
3542 let _ = write!(out, " calls: ");
3543 let names: Vec<&str> = graph.callees[file_idx]
3544 .iter()
3545 .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
3546 .collect();
3547 let _ = writeln!(out, "{}", names.join(", "));
3548 }
3549
3550 // Definitions with scope and signature
3551 for def in &file.defs {
3552 let scope_prefix = if def.scope.is_empty() {
3553 String::new()
3554 } else {
3555 format!("{} > ", def.scope)
3556 };
3557 if let Some(sig) = &def.signature {
3558 let _ = writeln!(out, " {scope_prefix}{} {sig}", def.kind);
3559 } else {
3560 let _ = writeln!(out, " {scope_prefix}{} {}", def.kind, def.name);
3561 }
3562 }
3563 let _ = writeln!(out);
3564 out
3565}
3566
3567/// Render tier 1: file path, rank, and signatures.
3568fn render_tier1(file: &FileNode, score: f32) -> String {
3569 let mut out = format!("## {} (rank: {score:.4})\n", file.path);
3570 for def in &file.defs {
3571 if let Some(sig) = &def.signature {
3572 let _ = writeln!(out, " {sig}");
3573 } else {
3574 let _ = writeln!(out, " {} {}", def.kind, def.name);
3575 }
3576 }
3577 let _ = writeln!(out);
3578 out
3579}
3580
3581/// Render tier 2: file path, rank, and definition names/kinds.
3582fn render_tier2(file: &FileNode, score: f32) -> String {
3583 let mut out = format!("{} (rank: {score:.4})", file.path);
3584 if !file.defs.is_empty() {
3585 let names: Vec<String> = file
3586 .defs
3587 .iter()
3588 .map(|d| format!("{}:{}", d.kind, d.name))
3589 .collect();
3590 let _ = write!(out, " -- {}", names.join(", "));
3591 }
3592 let _ = writeln!(out);
3593 out
3594}
3595
3596/// Render tier 3: file path only.
3597fn render_tier3(file: &FileNode) -> String {
3598 format!("{}\n", file.path)
3599}
3600
3601// ── JSON rendering ───────────────────────────────────────────────────
3602
3603/// Build the `lsp_location` for a file itself (line 0).
3604fn file_lsp_location(path: &str) -> RepoMapLspLocation {
3605 RepoMapLspLocation {
3606 file_path: if path.starts_with("./") || path.starts_with('/') {
3607 path.to_string()
3608 } else {
3609 format!("./{path}")
3610 },
3611 start_line: 0,
3612 start_character: 0,
3613 end_line: 0,
3614 end_character: 0,
3615 }
3616}
3617
3618/// Infer `ContentKind` from a file path's extension.
3619fn content_kind_for_path(path: &str) -> ContentKind {
3620 let ext = std::path::Path::new(path)
3621 .extension()
3622 .and_then(|e| e.to_str())
3623 .unwrap_or("");
3624 ContentKind::from_extension(ext)
3625}
3626
3627/// Minimum byte envelope reserved for each included file.
3628///
3629/// Even a file with zero symbols takes JSON overhead for path, rank, arrays,
3630/// etc. Calibrated against actual serde_json output for an empty `RepoMapFile`:
3631/// `{"lsp_location":{"file_path":"./src/file_N.rs","start_line":0,"start_character":0,`
3632/// `"end_line":0,"end_character":0},"rank":0.1234,"content_kind":"code",`
3633/// `"calls":[],"symbols":[],"truncated_symbols":0,"truncated_calls":0}` ≈ 250 bytes.
3634///
3635/// This floor prevents the budget allocator from giving a file so little space
3636/// that it can emit no envelope at all.
3637const FILE_ENVELOPE_MIN_BYTES: usize = 250;
3638
3639/// Minimum useful payload for an admitted file: envelope plus room for at
3640/// least 2-3 typical-sized symbols. Files whose fair share cannot meet this
3641/// floor are excluded entirely (Fix A, 4.0.2). Without this guard, low-rank
3642/// tail files consume budget on envelopes that contain no symbols or calls,
3643/// crowding out content for the top files.
3644const FILE_MIN_USEFUL_BYTES: usize = 600;
3645
3646/// Fraction of each file's per-file budget reserved for outgoing-call edges
3647/// after the envelope is paid. The remaining (1 - this) fraction goes to
3648/// symbols. Symbol leftover flows into calls; call leftover flows to the
3649/// next file. (Fix C, 4.0.2 — without a reserve, the symbol loop saturates
3650/// the per-file budget and calls always come up empty.)
3651const CALLS_BUDGET_FRACTION: f64 = 0.30;
3652
3653/// Maximum fraction of the total budget that a single file may claim.
3654///
3655/// Without this cap a single very-high-rank file (e.g. `lib.rs`) could
3656/// consume the entire budget, leaving all other files empty.
3657const MAX_FILE_SHARE: f64 = 0.40;
3658
3659/// AST kind priority for orientation-style symbol ordering. Higher = surface
3660/// earlier. Used when def-level PageRank is degenerate (most ranks near zero)
3661/// to fall back on structural signal rather than noise.
3662///
3663/// The intuition: a reader orienting in a codebase wants to see the file's
3664/// *shape* before its *behaviors*. Types declare shape; functions declare
3665/// behavior; fields and constants are internal detail. This ordering matches
3666/// how humans read code top-down. (Fix B, 4.0.2.)
3667fn ast_kind_priority(kind: &str) -> u32 {
3668 match kind {
3669 // Tier 3: shape — what THIS file is
3670 "trait_item" | "interface" | "trait" => 30,
3671 "struct_item" | "struct" | "class_definition" | "class" => 29,
3672 "enum_item" | "enum" => 28,
3673 "type_item" | "type_alias_declaration" | "type_alias" => 27,
3674 "mod_item" | "module" | "namespace" => 26,
3675 // Tier 2: behavior — what THIS file does
3676 "function_item" | "function_definition" | "function" | "method_definition" => 20,
3677 "impl_item" | "impl" => 19,
3678 // Tier 1: declarations
3679 "const_item" | "const_declaration" | "const" => 10,
3680 "static_item" | "static" => 9,
3681 // Tier 0: internals (fields, variables, parameters)
3682 _ => 0,
3683 }
3684}
3685
3686/// Effective AST priority with corpus-relative rank promotion (4.0.4).
3687///
3688/// Preserves the 4.0.2 AST-priority ordering by default (types first,
3689/// then functions, then fields). When a def's PageRank significantly
3690/// exceeds the corpus median, promotes it up one or two tiers so that
3691/// load-bearing defs surface alongside their declared-tier neighbors.
3692///
3693/// Thresholds are corpus-median multiples (self-calibrating):
3694/// - rank > 4× median → +1 tier (e.g., hot function joins type tier)
3695/// - rank > 16× median → +2 tiers (extremely hot def)
3696/// - otherwise → declared tier preserved
3697///
3698/// On degenerate (flat) rank distributions the median equals the floor,
3699/// nothing crosses threshold, and 4.0.2 AST-priority ordering is fully
3700/// preserved. On informative distributions (post-4.0.3 enrichment),
3701/// hot defs surface proportionally.
3702fn effective_priority(kind: &str, def_rank: f32, promo_1: f32, promo_2: f32) -> u32 {
3703 let base = ast_kind_priority(kind);
3704 // Accumulate promotion tiers as a plain integer to satisfy clippy's
3705 // bool_to_int_with_if lint while preserving branch clarity.
3706 let promo_tiers: u32 = u32::from(def_rank > promo_1) + u32::from(def_rank > promo_2);
3707 // Tier spacing matches ast_kind_priority's 10-unit gaps.
3708 base + promo_tiers * 10
3709}
3710
3711/// Estimate the serialised JSON byte cost of one `RepoMapSymbol`.
3712///
3713/// Calibrated against actual serde_json output. A `RepoMapSymbol` serialises to
3714/// approximately:
3715/// `{"name":"<N>","kind":<K>,"lsp_location":{"file_path":"<P>","start_line":0,`
3716/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
3717///
3718/// That is ~165 bytes of overhead (braces, keys, fixed-width integers, rank)
3719/// plus the name length and file_path length. We pass the path length
3720/// separately because the path is the same for all symbols in one file.
3721fn estimate_symbol_bytes(name: &str) -> usize {
3722 // 165 bytes overhead + name length.
3723 // The file_path is not included here because it is part of the
3724 // envelope cost accounted separately.
3725 165 + name.len()
3726}
3727
3728/// Estimate the serialised JSON byte cost of one `RepoMapCall`.
3729///
3730/// Each call entry: `{"lsp_location":{"file_path":"<P>","start_line":0,`
3731/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
3732/// ≈ 120 bytes overhead + path length.
3733fn estimate_call_bytes(target_path: &str) -> usize {
3734 120 + target_path.len()
3735}
3736
3737/// Render a `PageRank`-weighted JSON map with token-budget allocation (4.0.1).
3738///
3739/// # Algorithm
3740///
3741/// **Step 1 — File-share allocation.** Each eligible file receives a byte
3742/// budget proportional to its `base_rank`. The share is capped at 40% of
3743/// `budget_total_bytes` and floored at [`FILE_ENVELOPE_MIN_BYTES`] (200 B).
3744/// Files are included in rank order until the cumulative allocation would
3745/// exceed the total budget.
3746///
3747/// **Step 2 — Per-file symbol fill.** For each included file, symbols are
3748/// walked in def-rank descending order. Inclusion continues until either (a)
3749/// the file's budget share is exhausted (with carry-over of leftover bytes to
3750/// the next file) or (b) a logarithmic attenuation cutoff fires: symbol at
3751/// position `i` (0-based) is included only if its rank ≥ `top_rank /
3752/// (1 + ln(i + 1))`. The same algorithm fills `calls[]` in target-file
3753/// base-rank order. `truncated_symbols` and `truncated_calls` track the
3754/// count of omitted entries.
3755///
3756/// **Step 3 — Response telemetry.** The response includes `estimated_bytes`
3757/// (actual returned content size), `budget_bytes` (`token_budget * 4`),
3758/// and `budget_exhausted` (`total_files > files.len()`).
3759///
3760/// # Arguments
3761///
3762/// - `graph` — the built dependency graph.
3763/// - `token_budget` — caller-specified token budget (× 4 = byte budget).
3764/// - `focus` — optional file index for topic-sensitive `PageRank`.
3765/// - `include_metadata` — when `false` (default), Meta-classified files
3766/// are excluded before ranking.
3767#[must_use]
3768#[expect(
3769 clippy::cast_precision_loss,
3770 reason = "rank sums and counts are small f32/f64; precision loss is acceptable"
3771)]
3772#[expect(
3773 clippy::too_many_lines,
3774 reason = "the three-step allocation algorithm (file-share → symbol-fill → calls-fill) \
3775 is sequential and share state; splitting into helpers would require passing \
3776 mutable slices across three boundaries with no clarity gain"
3777)]
3778pub fn render_json_budgeted(
3779 graph: &RepoGraph,
3780 token_budget: usize,
3781 focus: Option<usize>,
3782 include_metadata: bool,
3783) -> GetRepoMapResponse {
3784 let n = graph.files.len();
3785 if n == 0 {
3786 let budget_bytes = token_budget * CHARS_PER_TOKEN;
3787 return GetRepoMapResponse {
3788 files: vec![],
3789 total_files: 0,
3790 estimated_bytes: 0,
3791 budget_bytes,
3792 budget_exhausted: false,
3793 capped: false,
3794 };
3795 }
3796
3797 let budget_total_bytes = token_budget * CHARS_PER_TOKEN;
3798
3799 // Recompute topic-sensitive ranks if focus is given.
3800 let ranks = if focus.is_some() {
3801 pagerank(n, &graph.edges, focus)
3802 } else {
3803 graph.base_ranks.clone()
3804 };
3805
3806 // Sort all file indices by rank descending.
3807 let mut sorted: Vec<usize> = (0..n).collect();
3808 sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
3809
3810 // Apply metadata exclusion filter.
3811 let eligible: Vec<usize> = if include_metadata {
3812 sorted
3813 } else {
3814 sorted
3815 .into_iter()
3816 .filter(|&idx| {
3817 let kind = content_kind_for_path(&graph.files[idx].path);
3818 kind != ContentKind::Meta
3819 })
3820 .collect()
3821 };
3822
3823 let total_files = eligible.len();
3824
3825 // ── Corpus-median def-rank thresholds for tier promotion (4.0.4) ────────
3826 //
3827 // Compute once per call (corpus-wide, not per-file) so the threshold is
3828 // self-calibrating: flat distributions (all ranks equal) set median = floor
3829 // and nothing crosses threshold; informative distributions see proportional
3830 // promotion. Using corpus-wide median ensures a hot function in one file is
3831 // judged against the entire corpus, not just its local file peers.
3832 // Use the 75th percentile of nonzero def-ranks as the corpus reference value
3833 // for tier promotion (rather than the 50th percentile / median). The 75th
3834 // percentile is more robust: on a flat distribution most defs cluster near the
3835 // floor, so the 75th percentile is only marginally above the floor (making the
3836 // 4× threshold very selective). On an informative distribution (post-4.0.3
3837 // call-edge enrichment) the 75th percentile is meaningfully above the floor,
3838 // so the same 4× multiplier captures genuinely hot defs without falsely
3839 // promoting slightly-above-floor helpers.
3840 //
3841 // The 50th percentile (lower median) was rejected because on a 10-def corpus
3842 // with max/min ratio 5× the median equals the floor, causing the 4× threshold
3843 // to fire on defs that are only 5× above floor (a low-variance corpus). The
3844 // 75th percentile corrects this without requiring hand-tuned per-corpus magic
3845 // numbers.
3846 let corpus_reference_rank: f32 = {
3847 let mut nonzero: Vec<f32> = graph
3848 .def_ranks
3849 .iter()
3850 .copied()
3851 .filter(|r| *r > 0.0)
3852 .collect();
3853 if nonzero.is_empty() {
3854 0.0
3855 } else {
3856 nonzero.sort_unstable_by(f32::total_cmp);
3857 let n = nonzero.len();
3858 // 75th percentile index: floor(0.75 * (n - 1))
3859 let idx = (3 * (n - 1)) / 4;
3860 nonzero[idx]
3861 }
3862 };
3863 let promo_1_threshold = corpus_reference_rank * 4.0; // +1 tier
3864 let promo_2_threshold = corpus_reference_rank * 16.0; // +2 tiers
3865
3866 // ── Step 1: File-share allocation ────────────────────────────────
3867
3868 // Greedily determine which files fit within the budget, computing each
3869 // file's share as it is added. We must run a two-pass approach:
3870 // pass A: determine which files are included (cumulative sum check),
3871 // pass B: fill symbols/calls using final per-file allocations.
3872 //
3873 // The "included" decision is based on the running cumulative sum so that
3874 // the leftover redistribution in step 2 can carry forward correctly.
3875
3876 // Floor-first admission (Fix A, 4.0.2):
3877 //
3878 // Cap admitted file count so each gets at least FILE_MIN_USEFUL_BYTES.
3879 // Below this threshold the response would carry envelopes that contain
3880 // no symbols or calls — pure overhead, no information. Concentrating
3881 // the budget on fewer files with real content is strictly better for
3882 // orientation than dropping envelope sentinels for many files.
3883 let max_admissible = budget_total_bytes / FILE_MIN_USEFUL_BYTES;
3884 let admit_count = eligible.len().min(max_admissible.max(1));
3885
3886 let budget_f64 = budget_total_bytes as f64;
3887
3888 // Pre-compute rank sum across ADMITTED files only (top-N by rank). f64
3889 // to avoid precision loss when summing many small f32 values.
3890 let admitted_rank_sum: f64 = eligible
3891 .iter()
3892 .take(admit_count)
3893 .map(|&idx| f64::from(ranks[idx]))
3894 .sum();
3895 let admitted_rank_sum = if admitted_rank_sum > 0.0 {
3896 admitted_rank_sum
3897 } else {
3898 1.0
3899 };
3900
3901 // Compute per-file budgets. Each admitted file gets at least
3902 // FILE_MIN_USEFUL_BYTES; the proportional-to-rank share is applied on
3903 // top of the floor and capped at MAX_FILE_SHARE.
3904 let mut included_indices: Vec<usize> = Vec::new(); // indices into `eligible`
3905 let mut file_budgets: Vec<usize> = Vec::new();
3906 let mut cumulative_budget: usize = 0;
3907
3908 for (i, &file_idx) in eligible.iter().take(admit_count).enumerate() {
3909 let file_rank = f64::from(ranks[file_idx]);
3910 let raw_share = budget_f64 * file_rank / admitted_rank_sum;
3911 let capped = raw_share.min(budget_f64 * MAX_FILE_SHARE);
3912 // `capped` is non-negative and bounded by budget_f64 (a usize).
3913 #[expect(
3914 clippy::cast_possible_truncation,
3915 clippy::cast_sign_loss,
3916 reason = "capped is non-negative and bounded by budget_total_bytes (a usize)"
3917 )]
3918 let budget_i = (capped as usize).max(FILE_MIN_USEFUL_BYTES);
3919
3920 if cumulative_budget + budget_i > budget_total_bytes && !included_indices.is_empty() {
3921 break;
3922 }
3923 cumulative_budget += budget_i;
3924 included_indices.push(i);
3925 file_budgets.push(budget_i);
3926 }
3927
3928 // ── Step 2: Per-file symbol fill ─────────────────────────────────
3929
3930 let mut result_files: Vec<RepoMapFile> = Vec::with_capacity(included_indices.len());
3931 let mut leftover: usize = 0; // unused bytes carried from previous file
3932
3933 for (slot, &eligible_i) in included_indices.iter().enumerate() {
3934 let file_idx = eligible[eligible_i];
3935 let file = &graph.files[file_idx];
3936 let file_rank = ranks[file_idx];
3937 let file_path_lsp = file_lsp_location(&file.path);
3938
3939 let budget_in = file_budgets[slot] + leftover;
3940
3941 // Reserve a fraction of the post-envelope budget for outgoing calls
3942 // (Fix C, 4.0.2). Without this guard the symbol loop saturates
3943 // `budget_in` and the calls loop always trips its byte-check.
3944 // Symbol leftover flows into calls; call leftover flows to the
3945 // next file via the outer `leftover` variable.
3946 let post_envelope = budget_in.saturating_sub(FILE_ENVELOPE_MIN_BYTES);
3947 #[expect(
3948 clippy::cast_possible_truncation,
3949 clippy::cast_sign_loss,
3950 reason = "post_envelope * fraction is bounded by post_envelope (a usize)"
3951 )]
3952 let calls_reserve = (post_envelope as f64 * CALLS_BUDGET_FRACTION) as usize;
3953 let symbols_budget = FILE_ENVELOPE_MIN_BYTES + post_envelope.saturating_sub(calls_reserve);
3954 let mut used: usize = FILE_ENVELOPE_MIN_BYTES; // envelope cost
3955
3956 // ── Symbols ──────────────────────────────────────────────────
3957 // Retrieve def-level ranks for this file via the offset table.
3958 let def_count = file.defs.len();
3959 let def_offset = if file_idx < graph.def_offsets.len() {
3960 graph.def_offsets[file_idx]
3961 } else {
3962 0
3963 };
3964
3965 // Build (def_idx, rank, kind_priority, start_byte) tuples. We sort
3966 // by a composite key: AST kind priority (descending) — putting types
3967 // before functions before fields — then by def_rank (descending)
3968 // within each tier. This is Fix B (4.0.2): the def_rank distribution
3969 // is often degenerate (most defs share near-zero rank because the
3970 // call-edge extractor doesn't capture every dispatch), so we use
3971 // structural signal as the primary ordering and def_rank as the
3972 // within-tier tiebreaker. When def_rank IS informative, it dominates
3973 // *within* its kind tier and recovers the original behavior; the AST
3974 // signal only shifts ordering *between* tiers.
3975 let mut def_rank_pairs: Vec<(usize, f32, u32, u32)> = (0..def_count)
3976 .map(|di| {
3977 let flat = def_offset + di;
3978 let r = graph.def_ranks.get(flat).copied().unwrap_or(0.0);
3979 // Store the ORIGINAL ast_kind_priority in the tuple (used by the
3980 // per-tier attenuation loop below). The sort comparator uses
3981 // effective_priority (which may be higher due to 4.0.4 promotion)
3982 // to reorder hot defs ahead of cold type-tier defs, while the
3983 // attenuation tier tracker continues to use the original AST tier
3984 // so the existing per-tier cutoff behaviour is preserved.
3985 let kind_prio = ast_kind_priority(&file.defs[di].kind);
3986 let decl_order = file.defs[di].start_byte;
3987 (di, r, kind_prio, decl_order)
3988 })
3989 .collect();
3990 def_rank_pairs.sort_unstable_by(|a, b| {
3991 // Primary: effective priority (4.0.4: AST kind + corpus-rank promotion) descending.
3992 // Hot defs that exceed corpus-median thresholds are promoted above their
3993 // declared tier so they surface before cold type-tier defs.
3994 let eff_a = effective_priority(
3995 &file.defs[a.0].kind,
3996 a.1,
3997 promo_1_threshold,
3998 promo_2_threshold,
3999 );
4000 let eff_b = effective_priority(
4001 &file.defs[b.0].kind,
4002 b.1,
4003 promo_1_threshold,
4004 promo_2_threshold,
4005 );
4006 eff_b
4007 .cmp(&eff_a)
4008 // Secondary: def_rank descending within tier.
4009 .then_with(|| b.1.total_cmp(&a.1))
4010 // Tertiary: earlier declaration order (stable, deterministic).
4011 .then_with(|| a.3.cmp(&b.3))
4012 });
4013
4014 let top_def_rank = def_rank_pairs.first().map(|&(_, r, _, _)| r).unwrap_or(0.0);
4015
4016 let mut symbols: Vec<RepoMapSymbol> = Vec::new();
4017 let mut truncated_symbols: usize = 0;
4018
4019 // Track per-tier position for the attenuation cutoff. When AST kind
4020 // priority changes (we've moved from types to functions, say), reset
4021 // the position so the attenuation curve restarts. Otherwise a
4022 // structurally-equivalent-but-later tier would be unfairly cut.
4023 let mut tier_pos: usize = 0;
4024 let mut current_tier: Option<u32> = None;
4025 let mut tier_top_rank: f32 = top_def_rank;
4026
4027 for (pos, &(di, def_r, kind_prio, _)) in def_rank_pairs.iter().enumerate() {
4028 // Reset attenuation at tier boundaries.
4029 if current_tier != Some(kind_prio) {
4030 current_tier = Some(kind_prio);
4031 tier_pos = 0;
4032 tier_top_rank = def_r;
4033 }
4034
4035 // Logarithmic attenuation cutoff, relative to the tier's top rank.
4036 let cutoff = if tier_top_rank > 0.0 {
4037 tier_top_rank / (1.0 + (tier_pos as f32 + 1.0).ln())
4038 } else {
4039 0.0
4040 };
4041 if def_r < cutoff {
4042 // Attenuation cuts the rest of THIS tier; we don't stop
4043 // entirely because the next tier may still have useful
4044 // content within its own attenuation curve. Skip this def.
4045 truncated_symbols += 1;
4046 tier_pos += 1;
4047 continue;
4048 }
4049
4050 let def = &file.defs[di];
4051 let sym_bytes = estimate_symbol_bytes(&def.name);
4052 // Use the reserved symbols sub-budget (Fix C) so calls aren't
4053 // starved when symbols would otherwise saturate budget_in.
4054 if used + sym_bytes > symbols_budget {
4055 truncated_symbols += def_rank_pairs.len() - pos;
4056 break;
4057 }
4058
4059 let kind = crate::languages::lsp_symbol_kind_for_node_kind(&def.kind);
4060 let line_0 = def.start_line.saturating_sub(1) as usize;
4061 symbols.push(RepoMapSymbol {
4062 name: def.name.clone(),
4063 kind,
4064 lsp_location: RepoMapLspLocation {
4065 file_path: file_path_lsp.file_path.clone(),
4066 start_line: line_0,
4067 start_character: 0,
4068 end_line: line_0,
4069 end_character: 0,
4070 },
4071 rank: def_r,
4072 });
4073 used += sym_bytes;
4074 tier_pos += 1;
4075 }
4076
4077 // ── Calls ─────────────────────────────────────────────────────
4078 // Gather outgoing callees sorted by target file base_rank descending.
4079 let callee_indices: Vec<usize> = if file_idx < graph.callees.len() {
4080 let mut callees: Vec<(usize, f32)> = graph.callees[file_idx]
4081 .iter()
4082 .filter_map(|&ci| {
4083 let ci = ci as usize;
4084 graph.files.get(ci).map(|_| {
4085 let r = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
4086 (ci, r)
4087 })
4088 })
4089 .collect();
4090 callees.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
4091 callees.into_iter().map(|(ci, _)| ci).collect()
4092 } else {
4093 vec![]
4094 };
4095
4096 let call_total = callee_indices.len();
4097 let top_call_rank = callee_indices
4098 .first()
4099 .and_then(|&ci| graph.base_ranks.get(ci))
4100 .copied()
4101 .unwrap_or(0.0);
4102
4103 let mut calls: Vec<RepoMapCall> = Vec::new();
4104 let mut truncated_calls: usize = 0;
4105
4106 for (pos, &ci) in callee_indices.iter().enumerate() {
4107 let callee_rank = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
4108
4109 // Logarithmic attenuation cutoff on target rank.
4110 let cutoff = if top_call_rank > 0.0 {
4111 top_call_rank / (1.0 + (pos as f32 + 1.0).ln())
4112 } else {
4113 0.0
4114 };
4115 if callee_rank < cutoff {
4116 truncated_calls += call_total - pos;
4117 break;
4118 }
4119
4120 let callee_path = &graph.files[ci].path;
4121 let call_bytes = estimate_call_bytes(callee_path);
4122 if used + call_bytes > budget_in {
4123 truncated_calls += call_total - pos;
4124 break;
4125 }
4126
4127 calls.push(RepoMapCall {
4128 lsp_location: file_lsp_location(callee_path),
4129 rank: callee_rank,
4130 });
4131 used += call_bytes;
4132 }
4133
4134 // Carry unused bytes forward to the next file.
4135 leftover = budget_in.saturating_sub(used);
4136
4137 result_files.push(RepoMapFile {
4138 lsp_location: file_path_lsp,
4139 rank: file_rank,
4140 content_kind: content_kind_tag(content_kind_for_path(&file.path)),
4141 calls,
4142 symbols,
4143 truncated_symbols,
4144 truncated_calls,
4145 });
4146 }
4147
4148 let estimated_bytes = serde_json::to_string(&result_files)
4149 .map(|s| s.len())
4150 .unwrap_or(0);
4151
4152 let budget_exhausted = total_files > result_files.len();
4153
4154 GetRepoMapResponse {
4155 files: result_files,
4156 total_files,
4157 estimated_bytes,
4158 budget_bytes: budget_total_bytes,
4159 budget_exhausted,
4160 capped: budget_exhausted,
4161 }
4162}
4163
4164/// Render a `PageRank`-sorted JSON map of the repository (4.0.0 compatibility shim).
4165///
4166/// This function wraps [`render_json_budgeted`] with a synthetic token budget
4167/// derived from `max_files * 2000` (a generous per-file allowance). It exists
4168/// to keep the existing D1/D2 unit tests compiling without change; the MCP
4169/// layer calls [`render_json_budgeted`] directly in 4.0.1.
4170///
4171/// The `capped` field in the response reflects whether the budget was
4172/// exhausted before all `eligible` files were included, which is equivalent
4173/// to the previous `total_files > max_files` check.
4174///
4175/// When `include_metadata` is `false` (default), files whose extension
4176/// classifies as [`ContentKind::Meta`] are excluded before ranking.
4177#[must_use]
4178pub fn render_json(
4179 graph: &RepoGraph,
4180 max_files: usize,
4181 focus: Option<usize>,
4182 include_metadata: bool,
4183) -> GetRepoMapResponse {
4184 // Synthesise a generous token budget: 2000 tokens per requested file.
4185 // This ensures the existing D1/D2 tests (which pass small max_files values
4186 // like 3, 5, 50) see the same file-count behaviour they expect. The test
4187 // assertions check file counts, not byte sizes, so the exact budget value
4188 // only matters for ensuring enough headroom.
4189 let token_budget = max_files.saturating_mul(2000);
4190 render_json_budgeted(graph, token_budget, focus, include_metadata)
4191}
4192
4193// ── Tests ────────────────────────────────────────────────────────────
4194
4195#[cfg(test)]
4196mod tests {
4197 use super::*;
4198
4199 #[test]
4200 fn test_pagerank_simple() {
4201 // 3-node graph: 0 -> 1 -> 2, 2 -> 0 (cycle)
4202 let edges = vec![(0, 1, 1), (1, 2, 1), (2, 0, 1)];
4203 let ranks = pagerank(3, &edges, None);
4204
4205 // All nodes in a symmetric cycle should have equal rank
4206 assert_eq!(ranks.len(), 3);
4207 let sum: f32 = ranks.iter().sum();
4208 assert!(
4209 (sum - 1.0).abs() < 0.01,
4210 "ranks should sum to ~1.0, got {sum}"
4211 );
4212
4213 // In a perfect cycle, all ranks should be approximately equal
4214 let expected = 1.0 / 3.0;
4215 for (i, &r) in ranks.iter().enumerate() {
4216 assert!(
4217 (r - expected).abs() < 0.05,
4218 "rank[{i}] = {r}, expected ~{expected}"
4219 );
4220 }
4221 }
4222
4223 #[test]
4224 fn test_pagerank_star() {
4225 // Star graph: 0,1,2 all point to 3
4226 let edges = vec![(0, 3, 1), (1, 3, 1), (2, 3, 1)];
4227 let ranks = pagerank(4, &edges, None);
4228
4229 assert_eq!(ranks.len(), 4);
4230 // Node 3 should have the highest rank
4231 let max_idx = ranks
4232 .iter()
4233 .enumerate()
4234 .max_by(|a, b| a.1.total_cmp(b.1))
4235 .unwrap()
4236 .0;
4237 assert_eq!(max_idx, 3, "node 3 should have highest rank");
4238 assert!(
4239 ranks[3] > ranks[0],
4240 "rank[3]={} should be > rank[0]={}",
4241 ranks[3],
4242 ranks[0]
4243 );
4244 }
4245
4246 #[test]
4247 fn test_pagerank_topic_sensitive() {
4248 // 10-node chain: 0 -> 1 -> ... -> 9.
4249 //
4250 // With PERSONALIZATION_ALPHA = 0.15 and n = 10, the uniform share per
4251 // node is 1/10 = 0.10. The focus node (0) gets 0.15 teleportation
4252 // mass vs 0.10 uniform, so focused rank[0] > uniform rank[0] holds.
4253 //
4254 // The 3-node chain used previously broke when alpha was reduced from
4255 // 0.70 to 0.15 because 0.15 < 1/3 = 0.33 for n=3 — the focus node
4256 // received *less* teleportation than its uniform share, inverting the
4257 // expected direction. Using n=10 avoids this edge case while still
4258 // testing the personalization effect.
4259 let n = 10_usize;
4260 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4261 let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
4262 .map(|i| (i as u32, (i + 1) as u32, 1_u32))
4263 .collect();
4264 let uniform_ranks = pagerank(n, &edges, None);
4265 let biased_ranks = pagerank(n, &edges, Some(0));
4266
4267 // With focus on node 0, it should get a higher rank than uniform
4268 // because PERSONALIZATION_ALPHA (0.15) > 1/n (0.10) for n=10.
4269 assert!(
4270 biased_ranks[0] > uniform_ranks[0],
4271 "focused rank[0]={} should be > uniform rank[0]={}",
4272 biased_ranks[0],
4273 uniform_ranks[0]
4274 );
4275 }
4276
4277 // ── J1 tests — topic-sensitive PageRank soft personalization ─────────
4278
4279 /// J1 RED: `focus_file` PageRank must not collapse other-file ranks.
4280 ///
4281 /// Baseline (pre-4.0.5) concentrated 70% mass on the focus node, producing
4282 /// a degenerate Dirac delta: focus rank ≈ 0.703, all others ≈ 0.003.
4283 /// This test fails on the baseline and must pass after the fix.
4284 ///
4285 /// Invariant: with `PERSONALIZATION_ALPHA = 0.15`, focus node gets 0.15 of
4286 /// teleportation mass and each of the other (n-1) nodes gets 0.85/(n-1).
4287 /// On a star graph with n=10 nodes, the focus node rank must NOT be more
4288 /// than 40× the average non-focus rank. The 4.0.5 fix targets roughly
4289 /// 5-10× for a well-connected graph, so 40× is a conservative upper bound
4290 /// that the baseline (≈200×) fails.
4291 #[test]
4292 fn test_focus_file_topic_pagerank_preserves_rank_dispersion() {
4293 // Star graph: nodes 1..9 all point to node 0 (high natural rank).
4294 // Focus on node 1 (low natural rank) to test personalization effect.
4295 let n = 10_usize;
4296 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4297 let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
4298
4299 let ranks_focused = pagerank(n, &edges, Some(1));
4300
4301 let focus_rank = ranks_focused[1];
4302 let sum_non_focus: f32 = ranks_focused
4303 .iter()
4304 .enumerate()
4305 .filter(|&(i, _)| i != 1)
4306 .map(|(_, &r)| r)
4307 .sum();
4308 let n_non_focus = (n - 1) as f32;
4309 let avg_non_focus = sum_non_focus / n_non_focus;
4310
4311 let dispersion_ratio = focus_rank / avg_non_focus;
4312
4313 eprintln!(
4314 "J1 dispersion: focus_rank={focus_rank:.6}, avg_non_focus={avg_non_focus:.6}, \
4315 ratio={dispersion_ratio:.2}× (must be <= 40×)"
4316 );
4317
4318 // With 0.15 personalization alpha the focus node's teleportation
4319 // advantage is modest; 40× is an upper bound the old 0.70 code violates.
4320 assert!(
4321 dispersion_ratio <= 40.0,
4322 "focus rank is {dispersion_ratio:.1}× avg non-focus rank (must be ≤ 40×); \
4323 pre-fix baseline was ~200× due to 70% concentration — I#16"
4324 );
4325
4326 // Ranks must still sum to ~1.
4327 let total: f32 = ranks_focused.iter().sum();
4328 assert!(
4329 (total - 1.0).abs() < 0.01,
4330 "ranks must sum to ≈1.0; got {total}"
4331 );
4332 }
4333
4334 /// J1 RED: focus node must have the highest rank (it still gets the bias),
4335 /// but non-focus nodes must NOT collapse to a flat floor.
4336 ///
4337 /// Concretely: the second-highest-ranked file must be ≥ 10% of the focus
4338 /// file's rank (neighborhood rebiasing, not winner-take-all).
4339 #[test]
4340 fn test_focus_file_topic_pagerank_does_not_collapse_other_files() {
4341 // Linear chain: 0 → 1 → 2 → ... → 9 (directed).
4342 // Focus on node 0. Without personalization, ranks decrease along the
4343 // chain. With soft personalization the non-focus nodes stay non-trivial.
4344 let n = 10_usize;
4345 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4346 let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
4347 .map(|i| (i as u32, (i + 1) as u32, 1_u32))
4348 .collect();
4349
4350 let ranks = pagerank(n, &edges, Some(0));
4351
4352 let focus_rank = ranks[0];
4353 // All non-focus ranks must be ≥ 10% of focus rank.
4354 for (i, &r) in ranks.iter().enumerate().skip(1) {
4355 assert!(
4356 r >= focus_rank * 0.10,
4357 "rank[{i}] = {r:.6} is < 10% of focus rank {focus_rank:.6}; \
4358 non-focus files must not collapse to near-zero (I#16)"
4359 );
4360 }
4361 }
4362
4363 // ── J2 tests — neighborhood count parity ─────────────────────────────
4364
4365 /// J2 RED: `render_json_budgeted` with `focus=Some(i)` must return at
4366 /// least 80% as many files as the unfocused call with the same budget.
4367 ///
4368 /// Baseline collapses the focused run to 1 dominant file + a flat tail
4369 /// that may still appear, but the intent is no budget waste on zero-signal
4370 /// entries. With soft personalization the focused call should return a
4371 /// similar file count to the unfocused call (within ±20%).
4372 #[test]
4373 fn test_focus_file_returns_neighborhood_not_just_focus() {
4374 // Build a 12-file star graph with meaningful rank variation.
4375 let n = 12_usize;
4376 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4377 let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
4378 let base_ranks = pagerank(n, &edges, None);
4379 let (callers, callees) = build_neighbor_lists(n, &edges);
4380
4381 let file_nodes: Vec<FileNode> = (0..n)
4382 .map(|i| FileNode {
4383 path: format!("src/file_{i}.rs"),
4384 defs: vec![Definition {
4385 name: format!("func_{i}"),
4386 kind: "function_item".to_string(),
4387 start_line: 1,
4388 end_line: 5,
4389 scope: String::new(),
4390 signature: Some(format!("fn func_{i}() -> i32")),
4391 start_byte: 0,
4392 end_byte: 100,
4393 calls: vec![],
4394 }],
4395 imports: vec![],
4396 })
4397 .collect();
4398
4399 let graph = RepoGraph {
4400 files: file_nodes,
4401 edges,
4402 base_ranks,
4403 callers,
4404 callees,
4405 def_edges: vec![],
4406 def_ranks: vec![],
4407 def_callers: vec![],
4408 def_callees: vec![],
4409 def_offsets: vec![0],
4410 alpha: 0.5,
4411 };
4412
4413 let budget = 2000; // generous budget; all 12 files should fit
4414 let unfocused = render_json_budgeted(&graph, budget, None, false);
4415 let focused = render_json_budgeted(&graph, budget, Some(1), false);
4416
4417 let unfocused_n = unfocused.files.len();
4418 let focused_n = focused.files.len();
4419 #[expect(
4420 clippy::cast_possible_truncation,
4421 clippy::cast_sign_loss,
4422 reason = "unfocused_n is a file count (small, positive); f32 multiplication \
4423 by 0.80 and ceil produce a value in [0, n]; truncation to usize is safe"
4424 )]
4425 let min_expected = (unfocused_n as f32 * 0.80).ceil() as usize;
4426
4427 eprintln!(
4428 "J2 neighborhood: unfocused={unfocused_n} files, focused={focused_n} files \
4429 (need ≥ {min_expected})"
4430 );
4431
4432 assert!(
4433 focused_n >= min_expected,
4434 "focused call returned {focused_n} files; expected ≥ {min_expected} \
4435 (80% of unfocused {unfocused_n}); soft personalization must preserve \
4436 rank dispersion across files (I#16/J2)"
4437 );
4438 }
4439
4440 /// J2 RED: topic delta fingerprinting — focused run must reorder files
4441 /// relative to unfocused run (focus file surfaces near top), but both
4442 /// must contain similar total file counts.
4443 #[test]
4444 fn test_focus_delta_topic_fingerprinting_works() {
4445 // Bidirectional 8-file ring so all nodes are structurally equivalent.
4446 // Without focus all ranks are equal. With focus on node 3, node 3
4447 // must surface as the highest-ranked file.
4448 let n = 8_usize;
4449 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4450 let edges: Vec<(u32, u32, u32)> = (0..n)
4451 .flat_map(|i| {
4452 let next = ((i + 1) % n) as u32;
4453 let curr = i as u32;
4454 [(curr, next, 1_u32), (next, curr, 1_u32)]
4455 })
4456 .collect();
4457
4458 let ranks_uniform = pagerank(n, &edges, None);
4459 let ranks_focused = pagerank(n, &edges, Some(3));
4460
4461 // Focus node must have highest rank.
4462 let top_idx = ranks_focused
4463 .iter()
4464 .enumerate()
4465 .max_by(|a, b| a.1.total_cmp(b.1))
4466 .map(|(i, _)| i)
4467 .unwrap();
4468
4469 assert_eq!(
4470 top_idx, 3,
4471 "with focus=Some(3), node 3 must have highest rank; top was {top_idx}"
4472 );
4473
4474 // Uniform baseline: all ranks should be approximately equal.
4475 let uniform_max = ranks_uniform
4476 .iter()
4477 .copied()
4478 .fold(f32::NEG_INFINITY, f32::max);
4479 let uniform_min = ranks_uniform.iter().copied().fold(f32::INFINITY, f32::min);
4480 assert!(
4481 (uniform_max - uniform_min).abs() < 0.01,
4482 "on a ring without focus all ranks should be ≈equal; max={uniform_max:.6} min={uniform_min:.6}"
4483 );
4484
4485 // Focused run must rank the focus node significantly higher than others
4486 // but others must remain non-trivial (≥ 5% of focus).
4487 let focus_rank = ranks_focused[3];
4488 for (i, &r) in ranks_focused.iter().enumerate().filter(|&(i, _)| i != 3) {
4489 assert!(
4490 r >= focus_rank * 0.05,
4491 "rank[{i}]={r:.6} is < 5% of focus rank {focus_rank:.6}; \
4492 soft personalization must preserve non-focus ranks"
4493 );
4494 }
4495 }
4496
4497 // ── T1 tests — focus_file resolver normalization (I#20) ──────────────
4498
4499 /// Build a tiny synthetic graph whose `FileNode::path` values match the
4500 /// shape `build_graph` produces on disk (no leading `./`, forward slashes).
4501 fn focus_resolver_graph() -> RepoGraph {
4502 let file_nodes: Vec<FileNode> = vec![
4503 FileNode {
4504 path: "device_opt/services/storage.py".to_string(),
4505 defs: vec![],
4506 imports: vec![],
4507 },
4508 FileNode {
4509 path: "device_opt/ui/textual/screens/settings.py".to_string(),
4510 defs: vec![],
4511 imports: vec![],
4512 },
4513 FileNode {
4514 path: "device_opt/services/registry.py".to_string(),
4515 defs: vec![],
4516 imports: vec![],
4517 },
4518 FileNode {
4519 path: "tests/test_storage.py".to_string(),
4520 defs: vec![],
4521 imports: vec![],
4522 },
4523 ];
4524 let n = file_nodes.len();
4525 RepoGraph {
4526 files: file_nodes,
4527 edges: vec![],
4528 base_ranks: vec![1.0 / n as f32; n],
4529 callers: vec![vec![]; n],
4530 callees: vec![vec![]; n],
4531 def_edges: vec![],
4532 def_ranks: vec![],
4533 def_callers: vec![],
4534 def_callees: vec![],
4535 def_offsets: vec![0; n + 1],
4536 alpha: 0.5,
4537 }
4538 }
4539
4540 /// T1: focus_file paths emitted by `lsp_location.file_path` (with the
4541 /// `./` prefix) must resolve to the correct file index.
4542 ///
4543 /// Baseline reproduction (mnemosyne corpus, 4.0.5): passing
4544 /// `focus_file="./device_opt/.../settings.py"` produced rank values
4545 /// bit-identical to the unfocused call because the strict-suffix matcher
4546 /// in `tools.rs` failed both the `exact` and the `strict_suffix` checks
4547 /// when the focus carried the LSP `./` prefix. The matcher silently
4548 /// returned `focus = None`, masking the failure as "topic-sensitive
4549 /// PageRank does nothing on Python".
4550 #[test]
4551 fn test_focus_file_resolver_accepts_lsp_location_path() {
4552 let g = focus_resolver_graph();
4553 // LSP-shaped path with leading `./` — the form documented in
4554 // get_repo_map's instructions.
4555 let res = g.resolve_focus_file("./device_opt/ui/textual/screens/settings.py");
4556 match res {
4557 FocusResolution::Found(idx) => {
4558 assert_eq!(
4559 g.files[idx].path, "device_opt/ui/textual/screens/settings.py",
4560 "resolver must accept the ./-prefixed LSP path form (I#20)"
4561 );
4562 }
4563 FocusResolution::NotFound | FocusResolution::Ambiguous(_) => {
4564 panic!(
4565 "resolver returned {res:?} for ./device_opt/ui/textual/screens/settings.py; \
4566 the LSP-shaped path form must resolve to exactly one file (I#20)"
4567 );
4568 }
4569 }
4570 }
4571
4572 /// T1: the bare stored path (no `./`) must continue to resolve.
4573 /// Regression guard for the pre-fix matcher's "exact" path.
4574 #[test]
4575 fn test_focus_file_resolver_accepts_bare_stored_path() {
4576 let g = focus_resolver_graph();
4577 let res = g.resolve_focus_file("device_opt/services/storage.py");
4578 match res {
4579 FocusResolution::Found(idx) => {
4580 assert_eq!(g.files[idx].path, "device_opt/services/storage.py");
4581 }
4582 other => panic!("expected Found, got {other:?}"),
4583 }
4584 }
4585
4586 /// T1: strict-suffix match — `storage.py` must match
4587 /// `device_opt/services/storage.py` (prev char is `/`) but ambiguity
4588 /// (two `storage.py` files) must be reported, not silently picked.
4589 #[test]
4590 fn test_focus_file_resolver_strict_suffix_and_ambiguity() {
4591 let g = focus_resolver_graph();
4592 // "storage.py" matches both device_opt/services/storage.py and
4593 // tests/test_storage.py? No — test_storage.py has `_` before `storage.py`
4594 // (not `/`), so the strict-suffix matcher rejects it. Only one match.
4595 let res = g.resolve_focus_file("storage.py");
4596 assert!(
4597 matches!(res, FocusResolution::Found(_)),
4598 "strict-suffix `storage.py` must match exactly one file (the `_` in \
4599 test_storage.py blocks the strict-suffix), got {res:?}"
4600 );
4601 // Add a second services/storage.py-shaped file to force ambiguity.
4602 let mut g2 = g.clone();
4603 g2.files.push(FileNode {
4604 path: "vendored/services/storage.py".to_string(),
4605 defs: vec![],
4606 imports: vec![],
4607 });
4608 g2.base_ranks.push(0.0);
4609 g2.callers.push(vec![]);
4610 g2.callees.push(vec![]);
4611 g2.def_offsets.push(*g2.def_offsets.last().unwrap());
4612 let res = g2.resolve_focus_file("storage.py");
4613 match res {
4614 FocusResolution::Ambiguous(cands) => {
4615 assert_eq!(cands.len(), 2, "expected two candidates, got {cands:?}");
4616 }
4617 other => panic!("expected Ambiguous, got {other:?}"),
4618 }
4619 }
4620
4621 /// T1: a focus that matches no file returns `NotFound`. The caller
4622 /// is responsible for either treating this as unfocused or surfacing
4623 /// an error — the resolver itself does not impose policy.
4624 #[test]
4625 fn test_focus_file_resolver_not_found() {
4626 let g = focus_resolver_graph();
4627 let res = g.resolve_focus_file("./does/not/exist.py");
4628 assert!(
4629 matches!(res, FocusResolution::NotFound),
4630 "expected NotFound, got {res:?}"
4631 );
4632 }
4633
4634 /// T1: empty focus does not match anything (avoids the empty-suffix
4635 /// degenerate that would otherwise match every file).
4636 #[test]
4637 fn test_focus_file_resolver_empty_input_is_not_found() {
4638 let g = focus_resolver_graph();
4639 let res = g.resolve_focus_file("");
4640 assert!(
4641 matches!(res, FocusResolution::NotFound),
4642 "empty focus must not match anything, got {res:?}"
4643 );
4644 }
4645
4646 /// T1: focus_file rank delta must be visible on a Python-shaped
4647 /// synthetic graph.
4648 ///
4649 /// Builds a small Python-style call graph (FileNode + Definition with
4650 /// resolved CallRefs, matching what `extract_calls` produces on a real
4651 /// Python corpus), runs `build_graph_from_files_pub` to get a
4652 /// `RepoGraph`, then calls `render_json_budgeted` with and without
4653 /// focus. Asserts that the focused call changes the rank of at least
4654 /// one non-focus file by ≥ 5% in either direction.
4655 ///
4656 /// On the baseline (pre-T1) this test passes when the caller supplies
4657 /// an int focus_idx directly — the engine's topic-sensitive PageRank is
4658 /// correct. The bug was at the string-to-int resolver layer in
4659 /// `tools.rs`, which silently masked the failure as "the rendering
4660 /// path doesn't propagate focus". This test locks the engine's
4661 /// behavior so a future regression in the rendering path is caught.
4662 #[test]
4663 #[expect(
4664 clippy::too_many_lines,
4665 reason = "synthetic Python-shaped graph (five FileNodes with defs + \
4666 CallRefs + ImportRefs) plus the two-call assertion sequence \
4667 is inherently long; splitting into helpers would obscure the \
4668 one-shot reproduction the test is locking in."
4669 )]
4670 fn test_focus_file_rank_delta_visible_on_python_corpus() {
4671 // Five files, Python-shaped: services/storage.py (a "hub" that two
4672 // UI files call into) plus a tests/ file. The Python tree-sitter
4673 // extractor produces `class_definition` and `function_definition`
4674 // kinds with resolved CallRefs pointing at the hub.
4675 let mut files: Vec<FileNode> = vec![
4676 FileNode {
4677 path: "device_opt/services/storage.py".to_string(),
4678 defs: vec![
4679 Definition {
4680 name: "ScanStore".to_string(),
4681 kind: "class_definition".to_string(),
4682 start_line: 1,
4683 end_line: 80,
4684 scope: String::new(),
4685 signature: None,
4686 start_byte: 0,
4687 end_byte: 2000,
4688 calls: vec![],
4689 },
4690 Definition {
4691 name: "save_scan".to_string(),
4692 kind: "function_definition".to_string(),
4693 start_line: 20,
4694 end_line: 40,
4695 scope: "class_definition ScanStore".to_string(),
4696 signature: Some("def save_scan(self, scan)".to_string()),
4697 start_byte: 200,
4698 end_byte: 600,
4699 calls: vec![],
4700 },
4701 ],
4702 imports: vec![],
4703 },
4704 FileNode {
4705 path: "device_opt/services/registry.py".to_string(),
4706 defs: vec![Definition {
4707 name: "register".to_string(),
4708 kind: "function_definition".to_string(),
4709 start_line: 1,
4710 end_line: 30,
4711 scope: String::new(),
4712 signature: Some("def register(svc)".to_string()),
4713 start_byte: 0,
4714 end_byte: 600,
4715 calls: vec![CallRef {
4716 name: "save_scan".to_string(),
4717 qualified_path: None,
4718 receiver_type: None,
4719 byte_offset: 100,
4720 resolved: None,
4721 }],
4722 }],
4723 imports: vec![ImportRef {
4724 raw_path: "from device_opt.services import storage".to_string(),
4725 resolved_idx: Some(0),
4726 }],
4727 },
4728 FileNode {
4729 path: "device_opt/ui/screens/browse.py".to_string(),
4730 defs: vec![Definition {
4731 name: "browse_scans".to_string(),
4732 kind: "function_definition".to_string(),
4733 start_line: 1,
4734 end_line: 50,
4735 scope: String::new(),
4736 signature: Some("def browse_scans(app)".to_string()),
4737 start_byte: 0,
4738 end_byte: 1000,
4739 calls: vec![CallRef {
4740 name: "save_scan".to_string(),
4741 qualified_path: None,
4742 receiver_type: None,
4743 byte_offset: 200,
4744 resolved: None,
4745 }],
4746 }],
4747 imports: vec![ImportRef {
4748 raw_path: "from device_opt.services import storage".to_string(),
4749 resolved_idx: Some(0),
4750 }],
4751 },
4752 FileNode {
4753 path: "device_opt/ui/screens/settings.py".to_string(),
4754 defs: vec![Definition {
4755 name: "open_settings".to_string(),
4756 kind: "function_definition".to_string(),
4757 start_line: 1,
4758 end_line: 40,
4759 scope: String::new(),
4760 signature: Some("def open_settings(app)".to_string()),
4761 start_byte: 0,
4762 end_byte: 800,
4763 calls: vec![CallRef {
4764 name: "register".to_string(),
4765 qualified_path: None,
4766 receiver_type: None,
4767 byte_offset: 150,
4768 resolved: None,
4769 }],
4770 }],
4771 imports: vec![ImportRef {
4772 raw_path: "from device_opt.services import registry".to_string(),
4773 resolved_idx: Some(1),
4774 }],
4775 },
4776 FileNode {
4777 path: "tests/test_storage.py".to_string(),
4778 defs: vec![Definition {
4779 name: "test_save".to_string(),
4780 kind: "function_definition".to_string(),
4781 start_line: 1,
4782 end_line: 20,
4783 scope: String::new(),
4784 signature: Some("def test_save()".to_string()),
4785 start_byte: 0,
4786 end_byte: 400,
4787 calls: vec![CallRef {
4788 name: "save_scan".to_string(),
4789 qualified_path: None,
4790 receiver_type: None,
4791 byte_offset: 50,
4792 resolved: None,
4793 }],
4794 }],
4795 imports: vec![ImportRef {
4796 raw_path: "from device_opt.services import storage".to_string(),
4797 resolved_idx: Some(0),
4798 }],
4799 },
4800 ];
4801
4802 // Resolve calls so the graph builder has edges to chew on.
4803 let def_index = build_def_index(&files);
4804 resolve_calls(&mut files, &def_index, &HashMap::new());
4805 let graph = build_graph_from_files_pub(files);
4806
4807 // Sanity: the graph must have edges (the calls were resolved).
4808 assert!(
4809 !graph.edges.is_empty(),
4810 "Python-shaped synthetic graph must produce file-level edges; got 0. \
4811 The CallRefs may have failed to resolve."
4812 );
4813
4814 // Resolve the focus file via the new helper.
4815 let focus_idx = match graph.resolve_focus_file("./device_opt/ui/screens/settings.py") {
4816 FocusResolution::Found(i) => i,
4817 other => panic!("resolver must find settings.py via LSP-shaped path, got {other:?}"),
4818 };
4819
4820 let budget = 4000;
4821 let unfocused = render_json_budgeted(&graph, budget, None, false);
4822 let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
4823
4824 // Collect rank-by-path maps for both runs.
4825 let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
4826 .files
4827 .iter()
4828 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
4829 .collect();
4830 let focused_ranks: std::collections::HashMap<String, f32> = focused
4831 .files
4832 .iter()
4833 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
4834 .collect();
4835
4836 eprintln!("T1 Python — unfocused ranks: {unfocused_ranks:#?}");
4837 eprintln!("T1 Python — focused ranks: {focused_ranks:#?}");
4838
4839 // Find at least one non-focus file whose rank changed by ≥ 5% in
4840 // either direction. The threshold is conservative; the soft 0.15
4841 // personalization alpha redistributes mass enough that on this
4842 // 5-node graph the affected neighbors typically shift by 20%+.
4843 let focus_path = "./device_opt/ui/screens/settings.py";
4844 let mut max_delta_ratio = 0.0_f32;
4845 for (path, &u_rank) in &unfocused_ranks {
4846 if path == focus_path {
4847 continue;
4848 }
4849 if let Some(&f_rank) = focused_ranks.get(path)
4850 && u_rank > 0.0
4851 {
4852 let ratio = (f_rank - u_rank).abs() / u_rank;
4853 if ratio > max_delta_ratio {
4854 max_delta_ratio = ratio;
4855 }
4856 }
4857 }
4858 assert!(
4859 max_delta_ratio >= 0.05,
4860 "focus_file must rebias non-focus file ranks by ≥ 5%; \
4861 max observed delta ratio = {max_delta_ratio:.3} \
4862 (I#20: focus_file invisible on Python corpora)"
4863 );
4864
4865 // Bit-identity guard: at least one non-focus file's rank must NOT
4866 // equal its unfocused value. This is the pathology from the
4867 // mnemosyne reproduction: every rank value was bit-identical
4868 // across global/focused calls.
4869 let any_changed = unfocused_ranks.iter().any(|(path, &u_rank)| {
4870 path != focus_path
4871 && focused_ranks
4872 .get(path)
4873 .is_some_and(|&f_rank| f_rank.to_bits() != u_rank.to_bits())
4874 });
4875 assert!(
4876 any_changed,
4877 "no non-focus file rank changed across focused/unfocused calls — \
4878 bit-identical pathology (I#20). unfocused={unfocused_ranks:#?} \
4879 focused={focused_ranks:#?}"
4880 );
4881 }
4882
4883 /// T1: focus_file rank delta on a Rust-shaped synthetic graph.
4884 ///
4885 /// Regression test: confirms the engine's topic-sensitive PageRank
4886 /// works on Rust shapes (where T1's investigation found it already
4887 /// works, but the resolver fix must not break the existing path).
4888 ///
4889 /// This complements `test_focus_file_returns_neighborhood_not_just_focus`
4890 /// by additionally checking that (a) the resolver accepts a Rust path
4891 /// with the `./` LSP prefix, and (b) at least one non-focus file's
4892 /// rank moves by ≥ 5%.
4893 #[test]
4894 #[expect(
4895 clippy::too_many_lines,
4896 reason = "synthetic Rust-shaped graph with four FileNodes plus the \
4897 two-call assertion sequence inherently exceeds the 100-line \
4898 cap; the test mirrors the Python-shaped sibling."
4899 )]
4900 fn test_focus_file_rank_delta_preserved_on_rust_corpus() {
4901 let mut files: Vec<FileNode> = vec![
4902 FileNode {
4903 path: "src/lib.rs".to_string(),
4904 defs: vec![Definition {
4905 name: "Engine".to_string(),
4906 kind: "struct_item".to_string(),
4907 start_line: 1,
4908 end_line: 30,
4909 scope: String::new(),
4910 signature: None,
4911 start_byte: 0,
4912 end_byte: 600,
4913 calls: vec![],
4914 }],
4915 imports: vec![],
4916 },
4917 FileNode {
4918 path: "src/encoder/mod.rs".to_string(),
4919 defs: vec![Definition {
4920 name: "encode".to_string(),
4921 kind: "function_item".to_string(),
4922 start_line: 1,
4923 end_line: 40,
4924 scope: String::new(),
4925 signature: Some("fn encode(input: &str) -> Vec<f32>".to_string()),
4926 start_byte: 0,
4927 end_byte: 800,
4928 calls: vec![],
4929 }],
4930 imports: vec![ImportRef {
4931 raw_path: "use crate::lib;".to_string(),
4932 resolved_idx: Some(0),
4933 }],
4934 },
4935 FileNode {
4936 path: "src/search.rs".to_string(),
4937 defs: vec![Definition {
4938 name: "search".to_string(),
4939 kind: "function_item".to_string(),
4940 start_line: 1,
4941 end_line: 30,
4942 scope: String::new(),
4943 signature: Some("fn search(q: &str) -> Hits".to_string()),
4944 start_byte: 0,
4945 end_byte: 600,
4946 calls: vec![CallRef {
4947 name: "encode".to_string(),
4948 qualified_path: None,
4949 receiver_type: None,
4950 byte_offset: 100,
4951 resolved: None,
4952 }],
4953 }],
4954 imports: vec![ImportRef {
4955 raw_path: "use crate::encoder;".to_string(),
4956 resolved_idx: Some(1),
4957 }],
4958 },
4959 FileNode {
4960 path: "src/cli.rs".to_string(),
4961 defs: vec![Definition {
4962 name: "main".to_string(),
4963 kind: "function_item".to_string(),
4964 start_line: 1,
4965 end_line: 20,
4966 scope: String::new(),
4967 signature: Some("fn main()".to_string()),
4968 start_byte: 0,
4969 end_byte: 400,
4970 calls: vec![CallRef {
4971 name: "search".to_string(),
4972 qualified_path: None,
4973 receiver_type: None,
4974 byte_offset: 50,
4975 resolved: None,
4976 }],
4977 }],
4978 imports: vec![ImportRef {
4979 raw_path: "use crate::search;".to_string(),
4980 resolved_idx: Some(2),
4981 }],
4982 },
4983 ];
4984
4985 let def_index = build_def_index(&files);
4986 resolve_calls(&mut files, &def_index, &HashMap::new());
4987 let graph = build_graph_from_files_pub(files);
4988
4989 assert!(
4990 !graph.edges.is_empty(),
4991 "Rust-shaped synthetic graph must produce edges"
4992 );
4993
4994 let focus_idx = match graph.resolve_focus_file("./src/encoder/mod.rs") {
4995 FocusResolution::Found(i) => i,
4996 other => panic!("resolver must find encoder/mod.rs via LSP path, got {other:?}"),
4997 };
4998
4999 let budget = 4000;
5000 let unfocused = render_json_budgeted(&graph, budget, None, false);
5001 let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
5002
5003 let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
5004 .files
5005 .iter()
5006 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5007 .collect();
5008 let focused_ranks: std::collections::HashMap<String, f32> = focused
5009 .files
5010 .iter()
5011 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5012 .collect();
5013
5014 eprintln!("T1 Rust — unfocused: {unfocused_ranks:#?}");
5015 eprintln!("T1 Rust — focused: {focused_ranks:#?}");
5016
5017 let focus_path = "./src/encoder/mod.rs";
5018 let mut max_delta_ratio = 0.0_f32;
5019 for (path, &u_rank) in &unfocused_ranks {
5020 if path == focus_path {
5021 continue;
5022 }
5023 if let Some(&f_rank) = focused_ranks.get(path)
5024 && u_rank > 0.0
5025 {
5026 let ratio = (f_rank - u_rank).abs() / u_rank;
5027 if ratio > max_delta_ratio {
5028 max_delta_ratio = ratio;
5029 }
5030 }
5031 }
5032 assert!(
5033 max_delta_ratio >= 0.05,
5034 "focus_file must rebias non-focus file ranks by ≥ 5% on Rust shapes; \
5035 max observed delta = {max_delta_ratio:.3}"
5036 );
5037 }
5038
5039 #[test]
5040 fn test_pagerank_empty() {
5041 let ranks = pagerank(0, &[], None);
5042 assert!(ranks.is_empty());
5043 }
5044
5045 #[test]
5046 fn test_render_tiers() {
5047 // Build a small graph with 10 files to exercise all tiers
5048 let files: Vec<FileNode> = (0..10)
5049 .map(|i| FileNode {
5050 path: format!("src/file_{i}.rs"),
5051 defs: vec![Definition {
5052 name: format!("func_{i}"),
5053 kind: "function_item".to_string(),
5054 start_line: 1,
5055 end_line: 5,
5056 scope: String::new(),
5057 signature: Some(format!("func_{i}(x: i32) -> i32")),
5058 start_byte: 0,
5059 end_byte: 0,
5060 calls: vec![],
5061 }],
5062 imports: vec![],
5063 })
5064 .collect();
5065
5066 // Create a star graph: files 1-9 all import from file 0
5067 let edges: Vec<(u32, u32, u32)> = (1..10).map(|i| (i, 0, 1)).collect();
5068 let base_ranks = pagerank(10, &edges, None);
5069 let (top_callers, top_callees) = build_neighbor_lists(10, &edges);
5070
5071 let graph = RepoGraph {
5072 files,
5073 edges,
5074 base_ranks,
5075 callers: top_callers,
5076 callees: top_callees,
5077 def_edges: vec![],
5078 def_ranks: vec![],
5079 def_callers: vec![],
5080 def_callees: vec![],
5081 def_offsets: vec![0],
5082 alpha: 0.5,
5083 };
5084
5085 // Large budget: should include all files
5086 let full = render(&graph, 10_000, None);
5087 assert!(
5088 full.contains("file_0"),
5089 "output should contain the top-ranked file"
5090 );
5091 // file_0 should appear as tier 0 (highest rank)
5092 assert!(
5093 full.contains("## src/file_0.rs"),
5094 "top file should have tier 0 heading"
5095 );
5096
5097 // Tiny budget: should only fit a few files
5098 let small = render(&graph, 10, None);
5099 assert!(
5100 !small.is_empty(),
5101 "even tiny budget should produce some output"
5102 );
5103 // Should have fewer entries than full render
5104 let full_lines = full.lines().count();
5105 let small_lines = small.lines().count();
5106 assert!(
5107 small_lines < full_lines,
5108 "small budget ({small_lines} lines) should have fewer lines than full ({full_lines})"
5109 );
5110 }
5111
5112 #[test]
5113 fn test_render_empty_graph() {
5114 let graph = RepoGraph {
5115 files: vec![],
5116 edges: vec![],
5117 base_ranks: vec![],
5118 callers: vec![],
5119 callees: vec![],
5120 def_edges: vec![],
5121 def_ranks: vec![],
5122 def_callers: vec![],
5123 def_callees: vec![],
5124 def_offsets: vec![0],
5125 alpha: 0.5,
5126 };
5127 let output = render(&graph, 1000, None);
5128 assert!(output.is_empty(), "empty graph should render empty string");
5129 }
5130
5131 #[test]
5132 fn test_build_graph_on_fixtures() {
5133 let fixtures = Path::new(env!("CARGO_MANIFEST_DIR"))
5134 .parent()
5135 .unwrap()
5136 .parent()
5137 .unwrap()
5138 .join("tests")
5139 .join("fixtures");
5140
5141 let graph = build_graph(&fixtures).expect("build_graph should succeed on fixtures");
5142
5143 // Should find at least the 3 fixture files
5144 assert!(
5145 !graph.files.is_empty(),
5146 "graph should contain files from fixtures"
5147 );
5148
5149 // Should find definitions in the Rust fixture
5150 let rs_file = graph.files.iter().find(|f| f.path.ends_with("sample.rs"));
5151 assert!(rs_file.is_some(), "should find sample.rs");
5152 let rs_file = rs_file.unwrap();
5153 assert!(
5154 !rs_file.defs.is_empty(),
5155 "sample.rs should have definitions"
5156 );
5157 assert!(
5158 rs_file.defs.iter().any(|d| d.name == "hello"),
5159 "should find 'hello' function in sample.rs"
5160 );
5161
5162 // Should find definitions in the Python fixture
5163 let py_file = graph.files.iter().find(|f| f.path.ends_with("sample.py"));
5164 assert!(py_file.is_some(), "should find sample.py");
5165 let py_file = py_file.unwrap();
5166 assert!(
5167 !py_file.defs.is_empty(),
5168 "sample.py should have definitions"
5169 );
5170 assert!(
5171 py_file.defs.iter().any(|d| d.name == "greet"),
5172 "should find 'greet' function in sample.py"
5173 );
5174
5175 // PageRank scores should be computed
5176 assert_eq!(graph.base_ranks.len(), graph.files.len());
5177 let sum: f32 = graph.base_ranks.iter().sum();
5178 assert!(
5179 (sum - 1.0).abs() < 0.01,
5180 "PageRank scores should sum to ~1.0, got {sum}"
5181 );
5182 }
5183
5184 #[test]
5185 fn test_extract_imports_rust() {
5186 let source = "use crate::foo::bar;\nuse std::collections::HashMap;\n";
5187 let (lang, query) = import_query_for_extension("rs").unwrap();
5188 let imports = extract_imports(source, &lang, &query);
5189 assert_eq!(imports.len(), 2);
5190 assert!(imports[0].contains("crate::foo::bar"));
5191 }
5192
5193 #[test]
5194 fn test_extract_imports_python_stub() {
5195 let source = "from typing import Protocol\nimport pkg.types\n";
5196 let (lang, query) = import_query_for_extension("pyi").unwrap();
5197 let imports = extract_imports(source, &lang, &query);
5198 assert_eq!(imports.len(), 2);
5199 assert!(imports[0].contains("from typing import Protocol"));
5200 assert!(imports[1].contains("import pkg.types"));
5201 }
5202
5203 #[test]
5204 fn test_resolve_python_import_to_stub_file() {
5205 let root = PathBuf::from("/project");
5206 let mut file_index = HashMap::new();
5207 file_index.insert(PathBuf::from("/project/pkg/types.pyi"), 1);
5208
5209 let result = resolve_python_import("import pkg.types", &root, &file_index);
5210 assert_eq!(result, Some(1));
5211 }
5212
5213 #[test]
5214 fn test_resolve_rust_crate_import() {
5215 let root = PathBuf::from("/project");
5216 let file_path = PathBuf::from("/project/src/main.rs");
5217 let mut file_index = HashMap::new();
5218 file_index.insert(PathBuf::from("/project/src/foo/bar.rs"), 1);
5219 file_index.insert(PathBuf::from("/project/src/main.rs"), 0);
5220
5221 let result = resolve_rust_import("use crate::foo::bar;", &file_path, &root, &file_index);
5222 assert_eq!(result, Some(1));
5223 }
5224
5225 #[test]
5226 fn test_resolve_rust_external_crate_dropped() {
5227 let root = PathBuf::from("/project");
5228 let file_path = PathBuf::from("/project/src/main.rs");
5229 let file_index = HashMap::new();
5230
5231 let result = resolve_rust_import(
5232 "use std::collections::HashMap;",
5233 &file_path,
5234 &root,
5235 &file_index,
5236 );
5237 assert_eq!(result, None, "external crate imports should be dropped");
5238 }
5239
5240 #[test]
5241 fn test_neighbor_lists() {
5242 // 0 -> 1, 0 -> 2, 1 -> 2
5243 let edges = vec![(0, 1, 1), (0, 2, 1), (1, 2, 1)];
5244 let (incoming, outgoing) = build_neighbor_lists(3, &edges);
5245
5246 // Node 2 should be called by 0 and 1
5247 assert!(incoming[2].contains(&0));
5248 assert!(incoming[2].contains(&1));
5249
5250 // Node 0 should call 1 and 2
5251 assert!(outgoing[0].contains(&1));
5252 assert!(outgoing[0].contains(&2));
5253 }
5254
5255 /// G1 (R2.3 issue a): A scoped call `mod_a::foo()` must store:
5256 /// - `name = "foo"` (bare identifier, for def-index lookup)
5257 /// - `qualified_path = Some("mod_a::foo")` (full path, for disambiguation)
5258 ///
5259 /// Before G1, `name` stored the full `"mod_a::foo"` path. After G1, `name`
5260 /// is always the bare trailing identifier and `qualified_path` carries the
5261 /// full path when the call is scoped.
5262 #[test]
5263 fn test_scoped_identifier_calls_preserve_path() {
5264 use crate::languages;
5265 use streaming_iterator::StreamingIterator as _;
5266
5267 let source = "
5268mod mod_a {
5269 pub fn foo() {}
5270}
5271mod mod_b {
5272 pub fn foo() {}
5273}
5274fn caller() {
5275 mod_a::foo();
5276 mod_b::foo();
5277}
5278";
5279 let call_config =
5280 languages::call_query_for_extension("rs").expect("Rust call config must exist");
5281 let lang_config =
5282 languages::config_for_extension("rs").expect("Rust lang config must exist");
5283
5284 let mut defs = {
5285 let mut parser = tree_sitter::Parser::new();
5286 parser.set_language(&lang_config.language).unwrap();
5287 let tree = parser.parse(source, None).unwrap();
5288 let mut cursor = tree_sitter::QueryCursor::new();
5289 let mut out = Vec::new();
5290 let mut matches =
5291 cursor.matches(&lang_config.query, tree.root_node(), source.as_bytes());
5292 while let Some(m) = matches.next() {
5293 let mut name = String::new();
5294 let mut def_node = None;
5295 for cap in m.captures {
5296 let cname = &lang_config.query.capture_names()[cap.index as usize];
5297 if *cname == "name" {
5298 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
5299 } else if *cname == "def" {
5300 def_node = Some(cap.node);
5301 }
5302 }
5303 if let Some(node) = def_node {
5304 #[expect(clippy::cast_possible_truncation)]
5305 out.push(Definition {
5306 name,
5307 kind: node.kind().to_string(),
5308 start_line: node.start_position().row as u32 + 1,
5309 end_line: node.end_position().row as u32 + 1,
5310 scope: String::new(),
5311 signature: None,
5312 start_byte: node.start_byte() as u32,
5313 end_byte: node.end_byte() as u32,
5314 calls: vec![],
5315 });
5316 }
5317 }
5318 out
5319 };
5320
5321 extract_calls(source, &call_config, &mut defs);
5322
5323 // Find the `caller` function definition
5324 let caller_def = defs
5325 .iter()
5326 .find(|d| d.name == "caller")
5327 .expect("caller def");
5328
5329 // G1: bare name is "foo", qualified_path carries the module path.
5330 let call_names: Vec<&str> = caller_def.calls.iter().map(|c| c.name.as_str()).collect();
5331 let qualified_paths: Vec<Option<&str>> = caller_def
5332 .calls
5333 .iter()
5334 .map(|c| c.qualified_path.as_deref())
5335 .collect();
5336
5337 // Bare names must be the trailing identifier only.
5338 assert!(
5339 call_names.contains(&"foo"),
5340 "bare name 'foo' must appear for scoped calls; got: {call_names:?}"
5341 );
5342 // Qualified paths must carry the full scope.
5343 assert!(
5344 qualified_paths.contains(&Some("mod_a::foo")),
5345 "qualified_path 'mod_a::foo' must appear; got: {qualified_paths:?}"
5346 );
5347 assert!(
5348 qualified_paths.contains(&Some("mod_b::foo")),
5349 "qualified_path 'mod_b::foo' must appear; got: {qualified_paths:?}"
5350 );
5351 // Full paths must NOT appear in bare names.
5352 assert!(
5353 !call_names.contains(&"mod_a::foo"),
5354 "full path 'mod_a::foo' must not appear in bare name; got: {call_names:?}"
5355 );
5356 }
5357
5358 /// RED test (R2.3 issue b+c): Two defs named `Read` in different modules,
5359 /// an unqualified call to `Read`. Resolution must NOT silently pick the first.
5360 /// Either both are returned (ambiguous) or none.
5361 #[test]
5362 fn test_ambiguous_name_resolution_returns_all_or_none() {
5363 // Build two FileNodes each with a def named "Read", then a third with an
5364 // unqualified call to "Read".
5365 let file_a = FileNode {
5366 path: "mod_a.rs".to_string(),
5367 defs: vec![Definition {
5368 name: "Read".to_string(),
5369 kind: "trait_item".to_string(),
5370 start_line: 1,
5371 end_line: 3,
5372 scope: String::new(),
5373 signature: None,
5374 start_byte: 0,
5375 end_byte: 50,
5376 calls: vec![],
5377 }],
5378 imports: vec![],
5379 };
5380 let file_b = FileNode {
5381 path: "mod_b.rs".to_string(),
5382 defs: vec![Definition {
5383 name: "Read".to_string(),
5384 kind: "trait_item".to_string(),
5385 start_line: 1,
5386 end_line: 3,
5387 scope: String::new(),
5388 signature: None,
5389 start_byte: 0,
5390 end_byte: 50,
5391 calls: vec![],
5392 }],
5393 imports: vec![],
5394 };
5395 let file_c = FileNode {
5396 path: "caller.rs".to_string(),
5397 defs: vec![Definition {
5398 name: "do_thing".to_string(),
5399 kind: "function_item".to_string(),
5400 start_line: 1,
5401 end_line: 5,
5402 scope: String::new(),
5403 signature: None,
5404 start_byte: 0,
5405 end_byte: 100,
5406 calls: vec![CallRef {
5407 name: "Read".to_string(),
5408 qualified_path: None,
5409 receiver_type: None,
5410 byte_offset: 10,
5411 resolved: None,
5412 }],
5413 }],
5414 imports: vec![],
5415 };
5416
5417 let mut files = vec![file_a, file_b, file_c];
5418 let def_index = build_def_index(&files);
5419 resolve_calls(&mut files, &def_index, &HashMap::new());
5420
5421 // The unqualified call to "Read" is ambiguous (two candidates, neither in same
5422 // file nor imported). Resolution must leave it as None — silent first-wins is wrong.
5423 let resolved = files[2].defs[0].calls[0].resolved;
5424 assert_eq!(
5425 resolved, None,
5426 "ambiguous unqualified call with no import context must resolve to None, not silently pick first"
5427 );
5428 }
5429
5430 // ── D1 / D2 tests ────────────────────────────────────────────────
5431
5432 /// Build a small test graph with N files and an optional JSON-extension file.
5433 fn build_test_graph(n_code: usize, include_json: bool) -> (RepoGraph, Vec<usize>) {
5434 let mut file_nodes: Vec<FileNode> = (0..n_code)
5435 .map(|i| FileNode {
5436 path: format!("src/file_{i}.rs"),
5437 defs: vec![
5438 Definition {
5439 name: format!("func_{i}"),
5440 kind: "function_item".to_string(),
5441 start_line: 1,
5442 end_line: 5,
5443 scope: String::new(),
5444 signature: Some(format!("fn func_{i}() -> i32")),
5445 start_byte: 0,
5446 end_byte: 100,
5447 calls: vec![],
5448 },
5449 Definition {
5450 name: format!("MyStruct{i}"),
5451 kind: "struct_item".to_string(),
5452 start_line: 7,
5453 end_line: 10,
5454 scope: String::new(),
5455 signature: None,
5456 start_byte: 110,
5457 end_byte: 200,
5458 calls: vec![],
5459 },
5460 ],
5461 imports: vec![],
5462 })
5463 .collect();
5464
5465 let json_idx = if include_json {
5466 let idx = file_nodes.len();
5467 file_nodes.push(FileNode {
5468 path: "data/config.json".to_string(),
5469 defs: vec![],
5470 imports: vec![],
5471 });
5472 vec![idx]
5473 } else {
5474 vec![]
5475 };
5476
5477 // Build a star graph: all code files point to file_0.
5478 let n = file_nodes.len();
5479 #[expect(clippy::cast_possible_truncation, reason = "test: n_code << u32::MAX")]
5480 let edges: Vec<(u32, u32, u32)> = (1..n_code).map(|i| (i as u32, 0, 1)).collect();
5481
5482 let base_ranks = pagerank(n, &edges, None);
5483 let (callers, callees) = build_neighbor_lists(n, &edges);
5484
5485 let graph = RepoGraph {
5486 files: file_nodes,
5487 edges,
5488 base_ranks,
5489 callers,
5490 callees,
5491 def_edges: vec![],
5492 def_ranks: vec![],
5493 def_callers: vec![],
5494 def_callees: vec![],
5495 def_offsets: vec![0],
5496 alpha: 0.5,
5497 };
5498
5499 (graph, json_idx)
5500 }
5501
5502 /// D1: `render_json` returns a `GetRepoMapResponse` with a `files` array.
5503 ///
5504 /// On the baseline (before D1) `get_repo_map_ripvec` returned markdown prose via
5505 /// `repo_map::render`; no `files` key existed in the output.
5506 #[test]
5507 fn get_repo_map_returns_json_with_files_array() {
5508 let (graph, _) = build_test_graph(5, false);
5509 let response = render_json(&graph, 50, None, false);
5510 assert!(
5511 !response.files.is_empty(),
5512 "files array should be non-empty for a non-empty graph"
5513 );
5514 // Serialize and verify the JSON shape has a `files` key.
5515 let json = serde_json::to_string(&response).expect("serialize");
5516 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5517 assert!(
5518 parsed["files"].is_array(),
5519 "serialized response must have a `files` JSON array; got: {parsed}"
5520 );
5521 }
5522
5523 /// D1: every file entry has an `lsp_location` field.
5524 ///
5525 /// Before D1, output was prose text; no `lsp_location` existed anywhere in the response.
5526 #[test]
5527 fn get_repo_map_each_file_has_lsp_location() {
5528 let (graph, _) = build_test_graph(5, false);
5529 let response = render_json(&graph, 50, None, false);
5530 for file in &response.files {
5531 assert!(
5532 !file.lsp_location.file_path.is_empty(),
5533 "each file must have a non-empty lsp_location.file_path"
5534 );
5535 }
5536 // Also verify through JSON.
5537 let json = serde_json::to_string(&response).expect("serialize");
5538 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5539 for entry in parsed["files"].as_array().expect("files array") {
5540 assert!(
5541 entry["lsp_location"]["file_path"].is_string(),
5542 "each file entry must have lsp_location.file_path string; entry: {entry}"
5543 );
5544 }
5545 }
5546
5547 /// D1: every symbol has a `kind` (u32) and an `lsp_location`.
5548 ///
5549 /// Before D1 symbols were rendered as prose strings like `"function_item func_0"`.
5550 #[test]
5551 fn get_repo_map_each_symbol_has_kind_and_lsp_location() {
5552 let (graph, _) = build_test_graph(3, false);
5553 let response = render_json(&graph, 50, None, false);
5554 for file in &response.files {
5555 for sym in &file.symbols {
5556 assert!(
5557 sym.kind > 0,
5558 "symbol kind must be a positive LSP SymbolKind; got 0 for '{}'",
5559 sym.name
5560 );
5561 assert!(
5562 !sym.lsp_location.file_path.is_empty(),
5563 "symbol must have lsp_location.file_path"
5564 );
5565 }
5566 }
5567 // Verify through JSON: kind should be a number.
5568 let json = serde_json::to_string(&response).expect("serialize");
5569 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5570 for file_entry in parsed["files"].as_array().expect("files") {
5571 for sym_entry in file_entry["symbols"].as_array().expect("symbols") {
5572 assert!(
5573 sym_entry["kind"].is_number(),
5574 "symbol `kind` must be a JSON number; sym: {sym_entry}"
5575 );
5576 assert!(
5577 sym_entry["lsp_location"]["file_path"].is_string(),
5578 "symbol must have lsp_location.file_path; sym: {sym_entry}"
5579 );
5580 }
5581 }
5582 }
5583
5584 /// D1: `calls` field is an array of `RepoMapCall`-shaped objects (each has
5585 /// `lsp_location` and `rank`).
5586 ///
5587 /// In 4.0.1 calls moved from bare `lsp_location` objects to `RepoMapCall`
5588 /// objects that carry both the target `lsp_location` and the target file's
5589 /// `base_rank`.
5590 #[test]
5591 fn get_repo_map_calls_field_is_array_of_lsp_locations() {
5592 // Build a 5-file star graph so file_0 has non-empty callees.
5593 let (graph, _) = build_test_graph(5, false);
5594 let response = render_json(&graph, 50, None, false);
5595 let json = serde_json::to_string(&response).expect("serialize");
5596 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5597 for file_entry in parsed["files"].as_array().expect("files") {
5598 let calls = file_entry["calls"]
5599 .as_array()
5600 .expect("calls must be an array");
5601 for call in calls {
5602 // In 4.0.1 each call entry is a RepoMapCall with lsp_location + rank.
5603 assert!(
5604 call["lsp_location"]["file_path"].is_string(),
5605 "each call entry must have lsp_location.file_path string; call: {call}"
5606 );
5607 assert!(
5608 call["rank"].is_number(),
5609 "each call entry must have a numeric rank; call: {call}"
5610 );
5611 }
5612 }
5613 }
5614
5615 /// D2 / G3: `render_json_budgeted` with a very tight budget returns fewer files.
5616 ///
5617 /// Before the budget allocator, `max_files=3` controlled file count but not
5618 /// per-file expansion. In 4.0.1 the token_budget controls total bytes; with
5619 /// a budget of 1 token (= 4 bytes) only the envelope minimum allows any file
5620 /// at all, and the test verifies that the total_files counter still reflects
5621 /// the full eligible count. `render_json` (compat shim) passes a generous
5622 /// budget; use `render_json_budgeted` with a tight budget to verify the cap.
5623 #[test]
5624 fn get_repo_map_returns_at_most_max_files_files() {
5625 let (graph, _) = build_test_graph(10, false);
5626 // Use render_json_budgeted directly with a tight budget (600 bytes = 3 files
5627 // × 200-byte floor). Each file's envelope minimum is 200 bytes so a 600-byte
5628 // budget should admit at most 3 files.
5629 let response = render_json_budgeted(&graph, 150, None, false);
5630 assert!(
5631 response.files.len() <= 3,
5632 "files.len() = {} must be <= 3 for a 600-byte budget",
5633 response.files.len()
5634 );
5635 assert_eq!(
5636 response.total_files, 10,
5637 "total_files must reflect the full eligible count before budget cap"
5638 );
5639 assert!(
5640 response.capped,
5641 "capped must be true when total_files > files.len()"
5642 );
5643 }
5644
5645 /// D2: `include_metadata=false` (default) excludes JSON/TOML/etc. files.
5646 ///
5647 /// Before D2, JSON files with thousands of repeated keys dominated the
5648 /// output (Issue #5 — JSON-key flooding).
5649 #[test]
5650 fn get_repo_map_excludes_meta_by_default() {
5651 let (graph, _) = build_test_graph(3, /*include_json=*/ true);
5652 // Default: include_metadata = false
5653 let response = render_json(&graph, 50, None, false);
5654 for file in &response.files {
5655 assert!(
5656 !std::path::Path::new(&file.lsp_location.file_path)
5657 .extension()
5658 .is_some_and(|e| e.eq_ignore_ascii_case("json")),
5659 "JSON (Meta) files must be excluded when include_metadata=false; found: {}",
5660 file.lsp_location.file_path
5661 );
5662 }
5663 }
5664
5665 /// D2: `include_metadata=true` includes JSON files.
5666 ///
5667 /// Callers who opt-in to metadata should see all content kinds.
5668 #[test]
5669 fn get_repo_map_include_metadata_true_includes_json() {
5670 let (graph, _) = build_test_graph(3, /*include_json=*/ true);
5671 let response = render_json(&graph, 50, None, true);
5672 let has_json = response.files.iter().any(|f| {
5673 std::path::Path::new(&f.lsp_location.file_path)
5674 .extension()
5675 .is_some_and(|e| e.eq_ignore_ascii_case("json"))
5676 });
5677 assert!(
5678 has_json,
5679 "JSON file must be present when include_metadata=true"
5680 );
5681 }
5682
5683 /// J1/J2 MEASUREMENT: flask corpus focus_file=blueprints.py rank dispersion.
5684 ///
5685 /// Mandatory measurement from the 4.0.5 Wave-2 Front-C briefing:
5686 /// - `len(files) >= 8` (not collapsed to just the focus)
5687 /// - focus file rank is the highest in the response
5688 /// - next 5 files all have rank >= 10% of focus rank
5689 /// - neighborhood contains semantically related files (app.py, scaffold.py)
5690 #[test]
5691 #[ignore = "runs on flask corpus at tests/corpus/code/flask; use --ignored --nocapture"]
5692 #[expect(
5693 clippy::too_many_lines,
5694 reason = "end-to-end corpus measurement test; assertion sequence is sequential and cannot be meaningfully split"
5695 )]
5696 fn test_flask_focus_blueprints_rank_dispersion() {
5697 let corpus_root = Path::new(env!("CARGO_MANIFEST_DIR"))
5698 .parent()
5699 .unwrap()
5700 .parent()
5701 .unwrap()
5702 .join("tests/corpus/code/flask");
5703
5704 assert!(
5705 corpus_root.exists(),
5706 "flask corpus not found at {}",
5707 corpus_root.display()
5708 );
5709
5710 let graph = build_graph(&corpus_root).expect("build_graph on flask corpus");
5711 eprintln!("Flask corpus: {} files in graph", graph.files.len());
5712
5713 // Find focus file
5714 let focus_path = "src/flask/blueprints.py";
5715 let focus_idx = graph.files.iter().position(|f| f.path == focus_path);
5716 eprintln!("Focus file '{focus_path}' -> idx: {focus_idx:?}");
5717 assert!(
5718 focus_idx.is_some(),
5719 "blueprints.py not found in graph; available files: {:?}",
5720 graph
5721 .files
5722 .iter()
5723 .map(|f| &f.path)
5724 .take(20)
5725 .collect::<Vec<_>>()
5726 );
5727
5728 let response = render_json_budgeted(&graph, 4000, focus_idx, false);
5729
5730 // Criterion 1: at least 8 files returned.
5731 eprintln!(
5732 "Focused response: {} files (total_files={})",
5733 response.files.len(),
5734 response.total_files
5735 );
5736 assert!(
5737 response.files.len() >= 8,
5738 "expected >= 8 files in focused response; got {} — I#16 winner-take-all collapse",
5739 response.files.len()
5740 );
5741
5742 // Print top 10 for inspection.
5743 eprintln!("\nTop 10 focused files:");
5744 for (i, f) in response.files.iter().take(10).enumerate() {
5745 eprintln!(" [{i}] rank={:.6} {}", f.rank, f.lsp_location.file_path);
5746 }
5747
5748 // Criterion 2: focus file must appear near the top (top-3) of focused
5749 // results. With PERSONALIZATION_ALPHA=0.15 and the flask corpus,
5750 // src/flask/app.py has higher structural rank than blueprints.py and
5751 // may legitimately rank #1 — the focus boosts blueprints.py relative
5752 // to its unfocused position, but doesn't guarantee it beats every
5753 // structurally central neighbor. Being in top-3 confirms the bias
5754 // is working (pre-fix blueprints.py was #1 at 0.703 but that was a
5755 // degenerate collapse; now #1 or #2 is healthy).
5756 let focus_file_rank = response
5757 .files
5758 .iter()
5759 .find(|f| {
5760 f.lsp_location.file_path.contains("blueprints.py")
5761 && !f.lsp_location.file_path.contains("test_")
5762 && !f.lsp_location.file_path.contains("sansio")
5763 })
5764 .map(|f| f.rank)
5765 .unwrap_or(0.0);
5766 let focus_position = response
5767 .files
5768 .iter()
5769 .position(|f| {
5770 f.lsp_location.file_path.contains("blueprints.py")
5771 && !f.lsp_location.file_path.contains("test_")
5772 && !f.lsp_location.file_path.contains("sansio")
5773 })
5774 .unwrap_or(usize::MAX);
5775 eprintln!(
5776 "\nblueprinets.py position: #{} rank={:.6}",
5777 focus_position + 1,
5778 focus_file_rank
5779 );
5780 assert!(
5781 focus_position < 3,
5782 "blueprints.py must be in top-3 focused results (got position {}); \
5783 soft personalization must rebias toward focus neighborhood — I#16",
5784 focus_position + 1
5785 );
5786
5787 // Criterion 3: next 5 non-focus files have rank >= 10% of the top
5788 // file's rank. This is the core dispersion check: no more Dirac-delta
5789 // collapse where one file is 0.703 and all others are 0.003.
5790 let top_rank = response.files[0].rank;
5791 let non_focus_min_5 = response
5792 .files
5793 .iter()
5794 .filter(|f| {
5795 !(f.lsp_location.file_path.contains("blueprints.py")
5796 && !f.lsp_location.file_path.contains("test_")
5797 && !f.lsp_location.file_path.contains("sansio"))
5798 })
5799 .take(5)
5800 .map(|f| f.rank)
5801 .fold(f32::INFINITY, f32::min);
5802 let pct = non_focus_min_5 / top_rank * 100.0;
5803 eprintln!(
5804 "\nNext-5 (non-focus) min rank: {non_focus_min_5:.6} = {pct:.1}% of top rank {top_rank:.6}"
5805 );
5806 assert!(
5807 pct >= 10.0,
5808 "next-5 non-focus files min rank is {pct:.1}% of top rank (need ≥ 10%); \
5809 files are collapsing to near-zero floor — I#16"
5810 );
5811
5812 // Criterion 4: neighborhood quality — related files present.
5813 let related_names = ["app.py", "scaffold.py", "sansio"];
5814 let found_related: Vec<&str> = related_names
5815 .iter()
5816 .copied()
5817 .filter(|name| {
5818 response
5819 .files
5820 .iter()
5821 .any(|f| f.lsp_location.file_path.contains(name))
5822 })
5823 .collect();
5824 eprintln!("\nNeighborhood quality: found related files: {found_related:?}");
5825 // At least one related file should appear (soft assertion — log if missing).
5826 if found_related.is_empty() {
5827 eprintln!(
5828 "WARNING: no expected related files (app.py, scaffold.py) found in neighborhood"
5829 );
5830 }
5831 }
5832
5833 #[test]
5834 #[ignore = "runs on full ripvec codebase; use --nocapture to see output"]
5835 fn test_full_repo_map() {
5836 use std::time::Instant;
5837
5838 let root = Path::new(env!("CARGO_MANIFEST_DIR"))
5839 .parent()
5840 .unwrap()
5841 .parent()
5842 .unwrap();
5843
5844 // Phase 1: build_graph (walk + parse + import resolve + PageRank)
5845 let t0 = Instant::now();
5846 let graph = build_graph(root).expect("build_graph on ripvec root");
5847 let build_ms = t0.elapsed().as_secs_f64() * 1000.0;
5848
5849 // Phase 2: render (default, no focus)
5850 let t1 = Instant::now();
5851 let rendered = render(&graph, 2000, None);
5852 let render_ms = t1.elapsed().as_secs_f64() * 1000.0;
5853
5854 // Phase 3: render (topic-sensitive, focused on highest-ranked file)
5855 let t2 = Instant::now();
5856 let focus_idx = graph
5857 .base_ranks
5858 .iter()
5859 .enumerate()
5860 .max_by(|a, b| a.1.total_cmp(b.1))
5861 .map(|(i, _)| i);
5862 let focused = render(&graph, 2000, focus_idx);
5863 let focus_ms = t2.elapsed().as_secs_f64() * 1000.0;
5864
5865 eprintln!("\n=== Repo Map Performance ===");
5866 eprintln!(
5867 "Files: {}, Edges: {}, Defs: {}",
5868 graph.files.len(),
5869 graph.edges.len(),
5870 graph.files.iter().map(|f| f.defs.len()).sum::<usize>()
5871 );
5872 eprintln!("build_graph: {build_ms:.1}ms (walk + parse + resolve + PageRank)");
5873 eprintln!(
5874 "render(default): {render_ms:.3}ms ({} chars, ~{} tokens)",
5875 rendered.len(),
5876 rendered.len() / 4
5877 );
5878 eprintln!(
5879 "render(focused): {focus_ms:.3}ms ({} chars, ~{} tokens)",
5880 focused.len(),
5881 focused.len() / 4
5882 );
5883
5884 eprintln!("\nTop 5 by PageRank:");
5885 let mut ranked: Vec<(usize, f32)> = graph.base_ranks.iter().copied().enumerate().collect();
5886 ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
5887 for (i, rank) in ranked.iter().take(5) {
5888 eprintln!(" {:.4} {}", rank, graph.files[*i].path);
5889 }
5890
5891 eprintln!("\n=== Default Render ===\n{rendered}");
5892 eprintln!(
5893 "\n=== Focused Render (on {}) ===\n{focused}",
5894 focus_idx
5895 .map(|i| graph.files[i].path.as_str())
5896 .unwrap_or("none")
5897 );
5898 }
5899}