ripvec_core/repo_map.rs
1//! `PageRank`-weighted structural overview of a codebase.
2//!
3//! Builds a dependency graph from tree-sitter definition and import extraction,
4//! ranks files by importance using `PageRank` (standard or topic-sensitive), and
5//! renders a budget-constrained overview with tiered detail levels.
6
7use std::collections::HashMap;
8use std::fmt::Write as _;
9use std::path::{Path, PathBuf};
10
11use rayon::prelude::*;
12use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
13use streaming_iterator::StreamingIterator;
14use tree_sitter::{Parser, Query, QueryCursor};
15
16use serde::Serialize;
17
18use crate::chunk::ContentKind;
19use crate::languages;
20use crate::walk;
21
22/// Serialize a `ContentKind` to a lowercase string tag for JSON output.
23fn content_kind_tag(ck: ContentKind) -> &'static str {
24 match ck {
25 ContentKind::Code => "code",
26 ContentKind::Docs => "docs",
27 ContentKind::Meta => "meta",
28 }
29}
30
31// ── Data Structures ──────────────────────────────────────────────────
32
33/// Persisted dependency graph with `PageRank` scores.
34#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
35pub struct RepoGraph {
36 /// Files in the repository with definitions, imports, and calls.
37 pub files: Vec<FileNode>,
38 /// File-level edges (derived from def-level call edges).
39 pub edges: Vec<(u32, u32, u32)>,
40 /// File-level `PageRank` scores (aggregated from def-level).
41 pub base_ranks: Vec<f32>,
42 /// File-level callers (indices into `files`).
43 pub callers: Vec<Vec<u32>>,
44 /// File-level callees (indices into `files`).
45 pub callees: Vec<Vec<u32>>,
46 /// Definition-level call edges: `(caller_def, callee_def, weight)`.
47 pub def_edges: Vec<(DefId, DefId, u32)>,
48 /// Definition-level `PageRank` scores (flattened: `offsets[file_idx] + def_idx`).
49 pub def_ranks: Vec<f32>,
50 /// Definition-level callers (flattened, parallel to `def_ranks`).
51 pub def_callers: Vec<Vec<DefId>>,
52 /// Definition-level callees (flattened, parallel to `def_ranks`).
53 pub def_callees: Vec<Vec<DefId>>,
54 /// Prefix-sum offsets for flattening `DefId` to linear index.
55 pub def_offsets: Vec<usize>,
56 /// Auto-tuned alpha for search boost.
57 pub alpha: f32,
58}
59
60/// A file in the repository with its definitions and imports.
61#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
62pub struct FileNode {
63 /// Relative path from the repository root.
64 pub path: String,
65 /// Definitions (functions, structs, classes, etc.) extracted from this file.
66 pub defs: Vec<Definition>,
67 /// Import references extracted from this file.
68 pub imports: Vec<ImportRef>,
69}
70
71/// A definition extracted from a source file.
72#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
73pub struct Definition {
74 /// Name of the definition (e.g., function name, class name).
75 pub name: String,
76 /// Kind of syntax node (e.g., `function_item`, `class_definition`).
77 pub kind: String,
78 /// 1-based start line number.
79 pub start_line: u32,
80 /// 1-based end line number.
81 pub end_line: u32,
82 /// Scope chain (e.g., `"impl_item Foo > fn bar"`).
83 pub scope: String,
84 /// Function/method signature, if available.
85 pub signature: Option<String>,
86 /// Byte offset of this definition's start in the source file.
87 pub start_byte: u32,
88 /// Byte offset of this definition's end in the source file.
89 pub end_byte: u32,
90 /// Call sites within this definition's body.
91 pub calls: Vec<CallRef>,
92}
93
94/// An import reference extracted from a source file.
95#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
96pub struct ImportRef {
97 /// Raw import path as written in source (e.g., `crate::foo::bar`).
98 pub raw_path: String,
99 /// Resolved file index in [`RepoGraph::files`], if resolution succeeded.
100 pub resolved_idx: Option<u32>,
101}
102
103/// Unique identifier for a definition: (file index, definition index within file).
104pub type DefId = (u32, u16);
105
106/// A call site extracted from a definition body.
107#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
108pub struct CallRef {
109 /// Callee function/method name (bare, without qualifier).
110 ///
111 /// For scoped calls like `mod_a::foo()`, this is `"foo"`.
112 /// For bare calls like `foo()`, this is `"foo"`.
113 pub name: String,
114 /// Full qualified path for scoped calls, e.g. `Some("mod_a::foo")`.
115 ///
116 /// `None` for bare (unqualified) calls. When `Some`, `resolve_calls`
117 /// uses this for qualifier-based module disambiguation before falling
118 /// back to the bare `name`.
119 pub qualified_path: Option<String>,
120 /// Receiver type for method calls, inferred from local context.
121 ///
122 /// Set to `Some("Foo")` when:
123 /// - The call is `self.method()` inside `impl Foo { … }`.
124 /// - The call is `x.method()` where `x` has an explicit type annotation `x: Foo`.
125 /// - The call is `x.method()` after `let x = Foo::new()`.
126 ///
127 /// `None` for free function calls, or when the receiver type cannot be
128 /// inferred from local context alone. When `Some`, `resolve_calls` prefers
129 /// defs whose enclosing impl scope matches the receiver type.
130 pub receiver_type: Option<String>,
131 /// Byte offset of the call in the source file (for scoping to definitions).
132 pub byte_offset: u32,
133 /// Resolved target definition, if resolution succeeded.
134 pub resolved: Option<DefId>,
135}
136
137// ── JSON output types ────────────────────────────────────────────────
138
139/// LSP-shaped location pointing at a file or symbol within a file.
140///
141/// Lines and characters are 0-based, matching the Language Server Protocol
142/// convention so callers can pass this directly to LSP tools without any
143/// conversion.
144#[derive(Debug, Clone, Serialize)]
145pub struct RepoMapLspLocation {
146 /// Relative path from the repository root (prefixed with `./`).
147 pub file_path: String,
148 /// 0-based start line.
149 pub start_line: usize,
150 /// 0-based start character (0 for file-level locations).
151 pub start_character: usize,
152 /// 0-based end line (equals `start_line` for file-level locations).
153 pub end_line: usize,
154 /// 0-based end character (0 for file-level locations).
155 pub end_character: usize,
156}
157
158/// A top-level symbol extracted from a file in the repository map.
159///
160/// Analogous to an LSP `DocumentSymbol` but limited to the fields available
161/// from tree-sitter definition extraction. The `rank` field carries the
162/// definition-level `PageRank` score from [`RepoGraph::def_ranks`], enabling
163/// callers to prioritise symbols by structural importance.
164#[derive(Debug, Clone, Serialize)]
165pub struct RepoMapSymbol {
166 /// Symbol name (function name, struct name, etc.).
167 pub name: String,
168 /// LSP `SymbolKind` as a decimal — use the same values as
169 /// `lsp_workspace_symbols` and `lsp_document_symbols`.
170 pub kind: u32,
171 /// Location pointing at the symbol's definition line (0-based).
172 pub lsp_location: RepoMapLspLocation,
173 /// Definition-level `PageRank` score from [`RepoGraph::def_ranks`].
174 ///
175 /// Higher values indicate definitions that are called by many other
176 /// definitions. Used by the token-budget allocator to decide which
177 /// symbols to include when the per-file budget is constrained.
178 pub rank: f32,
179}
180
181/// An outgoing call-edge from a file to another file.
182///
183/// Carries both the target file's `lsp_location` and its `base_rank`
184/// (file-level `PageRank` score) so callers can decide how important
185/// each dependency is without a separate lookup.
186#[derive(Debug, Clone, Serialize)]
187pub struct RepoMapCall {
188 /// Location pointing at the target file (line 0, character 0).
189 pub lsp_location: RepoMapLspLocation,
190 /// File-level `PageRank` score of the target file.
191 pub rank: f32,
192}
193
194/// One file entry in the JSON repo map.
195///
196/// Carries the file's `PageRank` score, content kind, outgoing call-edges to
197/// other files, and the file's top-level symbol definitions — all with
198/// `lsp_location` so the caller can chain directly into LSP tools without
199/// any destructuring.
200#[derive(Debug, Clone, Serialize)]
201pub struct RepoMapFile {
202 /// Location pointing at the file itself (line 0, character 0).
203 ///
204 /// Pass `lsp_location.file_path` directly into `lsp_document_symbols` or
205 /// any other file-scoped tool.
206 pub lsp_location: RepoMapLspLocation,
207 /// `PageRank` score in [0, 1] (higher = more structurally central).
208 pub rank: f32,
209 /// Content classification: `"code"`, `"docs"`, or `"meta"`.
210 ///
211 /// Serialized as a lowercase string tag so JSON consumers can branch
212 /// without numeric magic values. Mirrors the `ContentKind` enum in
213 /// `ripvec-core::chunk`.
214 pub content_kind: &'static str,
215 /// Outgoing call-edges sorted by target file `PageRank` descending.
216 pub calls: Vec<RepoMapCall>,
217 /// Top-level definitions extracted from this file by tree-sitter,
218 /// sorted by definition-level `PageRank` descending and pruned to
219 /// the per-file token-budget allocation.
220 pub symbols: Vec<RepoMapSymbol>,
221 /// Number of symbols that were omitted due to budget exhaustion or
222 /// logarithmic attenuation cutoff. `truncated_symbols + symbols.len()`
223 /// equals the total definition count for the file.
224 pub truncated_symbols: usize,
225 /// Number of call-edges that were omitted due to budget exhaustion
226 /// or logarithmic attenuation cutoff. `truncated_calls + calls.len()`
227 /// equals the total callee count for the file.
228 pub truncated_calls: usize,
229}
230
231/// JSON-mode response envelope for `get_repo_map` (4.0.1 shape).
232///
233/// Replaces the `max_files`-capped shape from 4.0.0. The caller supplies a
234/// `token_budget`; files are allocated bytes proportional to their `PageRank`
235/// (40% cap per file, 200-byte envelope floor). Symbols are filled in
236/// def-rank order with a logarithmic attenuation cutoff. Leftover bytes
237/// cascade to subsequent files.
238///
239/// The `estimated_bytes`, `budget_bytes`, and `budget_exhausted` fields give
240/// callers real-time feedback on how tightly the budget was consumed.
241#[derive(Debug, Clone, Serialize)]
242pub struct GetRepoMapResponse {
243 /// Files sorted by `PageRank` descending, pruned to the token budget.
244 pub files: Vec<RepoMapFile>,
245 /// Total number of eligible files in the graph (pre-allocation).
246 ///
247 /// If `total_files > files.len()`, the budget ran out before all files
248 /// could be included. Read `budget_exhausted` directly for the boolean.
249 pub total_files: usize,
250 /// Actual serialised-JSON byte count for all returned content.
251 pub estimated_bytes: usize,
252 /// Budget ceiling in bytes that was used for allocation
253 /// (`token_budget * 4`).
254 pub budget_bytes: usize,
255 /// `true` when `total_files > files.len()` (budget was exhausted before
256 /// all eligible files were included).
257 pub budget_exhausted: bool,
258 /// Retained for backward compatibility with 4.0.0 callers that checked
259 /// `capped`. Equivalent to `budget_exhausted`.
260 pub capped: bool,
261}
262
263// ── Constants ────────────────────────────────────────────────────────
264
265/// `PageRank` damping factor.
266const DAMPING: f32 = 0.85;
267
268/// `PageRank` convergence threshold.
269const EPSILON: f32 = 1e-6;
270
271/// Maximum `PageRank` iterations.
272const MAX_ITERATIONS: usize = 100;
273
274/// Maximum callers/callees stored per file.
275const MAX_NEIGHBORS: usize = 5;
276
277/// Approximate characters per token for budget estimation.
278const CHARS_PER_TOKEN: usize = 4;
279
280/// Concentration mass placed on the focus node in topic-sensitive `PageRank`.
281///
282/// Following Haveliwala 2002 ("Topic-Sensitive PageRank"), the personalization
283/// vector places a small bias `α` on the focus node and distributes the
284/// remaining `1 - α` uniformly over all other nodes. This preserves rank
285/// dispersion across the corpus — the user sees a *neighborhood* of related
286/// files rebiased toward the focus, not a Dirac delta on the focus node with
287/// every other file collapsed to an equally negligible uniform floor.
288///
289/// Value 0.15 means:
290/// - focus node teleportation probability = 0.15
291/// - each of the (n - 1) other nodes = 0.85 / (n - 1)
292///
293/// Prior to 4.0.5, this constant was effectively 0.70 (70% bias on focus),
294/// which caused winner-take-all collapse observed on the flask and ripvec
295/// corpora (I#16).
296const PERSONALIZATION_ALPHA: f32 = 0.15;
297
298// ── Import Queries ───────────────────────────────────────────────────
299
300/// Compile a tree-sitter import query for the given extension.
301///
302/// Returns `None` for unsupported extensions.
303fn import_query_for_extension(ext: &str) -> Option<(tree_sitter::Language, Query)> {
304 let (lang, query_str): (tree_sitter::Language, &str) = match ext {
305 "rs" => (
306 tree_sitter_rust::LANGUAGE.into(),
307 "(use_declaration) @import",
308 ),
309 "py" | "pyi" => (
310 tree_sitter_python::LANGUAGE.into(),
311 concat!(
312 "(import_statement) @import\n",
313 "(import_from_statement) @import",
314 ),
315 ),
316 "js" | "jsx" => (
317 tree_sitter_javascript::LANGUAGE.into(),
318 "(import_statement source: (string) @import_path) @import",
319 ),
320 "ts" => (
321 tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
322 "(import_statement source: (string) @import_path) @import",
323 ),
324 "tsx" => (
325 tree_sitter_typescript::LANGUAGE_TSX.into(),
326 "(import_statement source: (string) @import_path) @import",
327 ),
328 "go" => (
329 tree_sitter_go::LANGUAGE.into(),
330 "(import_spec path: (interpreted_string_literal) @import_path) @import",
331 ),
332 // Ruby: require statements.
333 "rb" => (
334 tree_sitter_ruby::LANGUAGE.into(),
335 "(call method: (identifier) @_method arguments: (argument_list (string (string_content) @import_path)) (#eq? @_method \"require\")) @import",
336 ),
337 _ => return None,
338 };
339 let query = match Query::new(&lang, query_str) {
340 Ok(q) => q,
341 Err(e) => {
342 tracing::warn!(ext, %e, "import query compilation failed — language may be ABI-incompatible");
343 return None;
344 }
345 };
346 Some((lang, query))
347}
348
349/// Extract import paths from source using tree-sitter.
350fn extract_imports(
351 source: &str,
352 lang: &tree_sitter::Language,
353 import_query: &Query,
354) -> Vec<String> {
355 let mut parser = Parser::new();
356 if parser.set_language(lang).is_err() {
357 return vec![];
358 }
359 let Some(tree) = parser.parse(source, None) else {
360 return vec![];
361 };
362
363 let mut cursor = QueryCursor::new();
364 let mut imports = Vec::new();
365 let mut matches = cursor.matches(import_query, tree.root_node(), source.as_bytes());
366
367 while let Some(m) = matches.next() {
368 // Prefer @import_path capture (JS/TS/Go), fall back to full @import text
369 let mut import_path_text = None;
370 let mut import_text = None;
371
372 for cap in m.captures {
373 let cap_name = &import_query.capture_names()[cap.index as usize];
374 let text = &source[cap.node.start_byte()..cap.node.end_byte()];
375 if *cap_name == "import_path" {
376 import_path_text = Some(text.trim_matches(|c| c == '"' || c == '\''));
377 } else if *cap_name == "import" {
378 import_text = Some(text);
379 }
380 }
381
382 if let Some(path) = import_path_text {
383 imports.push(path.to_string());
384 } else if let Some(text) = import_text {
385 imports.push(text.to_string());
386 }
387 }
388
389 imports
390}
391
392// ── Import Resolution ────────────────────────────────────────────────
393
394/// Resolve a Rust `use` path to a file index in the file map.
395///
396/// Handles `crate::`, `self::`, and `super::` prefixes. External crate
397/// imports are dropped (returns `None`).
398fn resolve_rust_import(
399 raw: &str,
400 file_path: &Path,
401 root: &Path,
402 file_index: &HashMap<PathBuf, usize>,
403) -> Option<usize> {
404 // Extract the module path from `use crate::foo::bar;` or `use crate::foo::bar::Baz;`
405 let trimmed = raw
406 .trim()
407 .trim_start_matches("use ")
408 .trim_end_matches(';')
409 .trim();
410
411 let segments: Vec<&str> = trimmed.split("::").collect();
412 if segments.is_empty() {
413 return None;
414 }
415
416 // Determine the base directory and skip prefix segments
417 let (base, skip) = match segments[0] {
418 "crate" => {
419 // Find the nearest Cargo.toml ancestor to determine the crate root.
420 // In a workspace, `crate::foo` resolves relative to the crate's src/,
421 // not the workspace root.
422 let mut dir = file_path.parent();
423 let crate_root = loop {
424 match dir {
425 Some(d) if d.join("Cargo.toml").exists() => break d.join("src"),
426 Some(d) => dir = d.parent(),
427 None => break root.join("src"), // fallback
428 }
429 };
430 (crate_root, 1)
431 }
432 "self" => {
433 let dir = file_path.parent()?;
434 (dir.to_path_buf(), 1)
435 }
436 "super" => {
437 let dir = file_path.parent()?.parent()?;
438 (dir.to_path_buf(), 1)
439 }
440 // External crate — drop
441 _ => return None,
442 };
443
444 // Build candidate paths from the remaining segments.
445 // Try progressively shorter prefixes since the last segments
446 // may be items (struct, fn) rather than modules.
447 let path_segments = &segments[skip..];
448 for end in (1..=path_segments.len()).rev() {
449 let mut candidate = base.clone();
450 for seg in &path_segments[..end] {
451 // Strip glob patterns like `{Foo, Bar}`
452 let clean = seg.split('{').next().unwrap_or(seg).trim();
453 if !clean.is_empty() {
454 candidate.push(clean);
455 }
456 }
457
458 // Try file.rs
459 let as_file = candidate.with_extension("rs");
460 if let Some(&idx) = file_index.get(&as_file) {
461 return Some(idx);
462 }
463
464 // Try dir/mod.rs
465 let as_mod = candidate.join("mod.rs");
466 if let Some(&idx) = file_index.get(&as_mod) {
467 return Some(idx);
468 }
469 }
470
471 None
472}
473
474/// Resolve an import path to a file index based on file extension.
475fn resolve_import(
476 raw: &str,
477 ext: &str,
478 file_path: &Path,
479 root: &Path,
480 file_index: &HashMap<PathBuf, usize>,
481) -> Option<usize> {
482 match ext {
483 "rs" => resolve_rust_import(raw, file_path, root, file_index),
484 "py" | "pyi" => resolve_python_import(raw, root, file_index),
485 "js" | "jsx" | "ts" | "tsx" => resolve_js_import(raw, file_path, file_index),
486 // Go imports use full package paths — skip local resolution
487 _ => None,
488 }
489}
490
491/// Resolve a Python import to a file index.
492///
493/// Handles `import foo.bar` and `from foo.bar import baz` patterns.
494fn resolve_python_import(
495 raw: &str,
496 root: &Path,
497 file_index: &HashMap<PathBuf, usize>,
498) -> Option<usize> {
499 let module_path = if let Some(rest) = raw.strip_prefix("from ") {
500 rest.split_whitespace().next()?
501 } else if let Some(rest) = raw.strip_prefix("import ") {
502 rest.split_whitespace().next()?
503 } else {
504 return None;
505 };
506
507 let rel_path: PathBuf = module_path.split('.').collect();
508 for ext in ["py", "pyi"] {
509 let as_file = root.join(&rel_path).with_extension(ext);
510 if let Some(&idx) = file_index.get(&as_file) {
511 return Some(idx);
512 }
513 }
514
515 for init_name in ["__init__.py", "__init__.pyi"] {
516 let as_init = root.join(&rel_path).join(init_name);
517 if let Some(&idx) = file_index.get(&as_init) {
518 return Some(idx);
519 }
520 }
521
522 None
523}
524
525/// Resolve a JS/TS import to a file index.
526///
527/// Handles relative paths like `./foo` or `../bar`.
528fn resolve_js_import(
529 raw: &str,
530 file_path: &Path,
531 file_index: &HashMap<PathBuf, usize>,
532) -> Option<usize> {
533 if !raw.starts_with('.') {
534 return None;
535 }
536
537 let dir = file_path.parent()?;
538 let candidate = dir.join(raw);
539
540 for ext in &["js", "jsx", "ts", "tsx"] {
541 let with_ext = candidate.with_extension(ext);
542 if let Some(&idx) = file_index.get(&with_ext) {
543 return Some(idx);
544 }
545 }
546
547 for ext in &["js", "jsx", "ts", "tsx"] {
548 let index_file = candidate.join("index").with_extension(ext);
549 if let Some(&idx) = file_index.get(&index_file) {
550 return Some(idx);
551 }
552 }
553
554 None
555}
556
557// ── Extraction ───────────────────────────────────────────────────────
558
559/// Extract definitions from a source file using tree-sitter.
560fn extract_definitions(source: &str, config: &languages::LangConfig) -> Vec<Definition> {
561 let mut parser = Parser::new();
562 if parser.set_language(&config.language).is_err() {
563 return vec![];
564 }
565 let Some(tree) = parser.parse(source, None) else {
566 return vec![];
567 };
568
569 let mut cursor = QueryCursor::new();
570 let mut defs = Vec::new();
571 let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
572
573 while let Some(m) = matches.next() {
574 let mut name = String::new();
575 let mut def_node = None;
576
577 for cap in m.captures {
578 let cap_name = &config.query.capture_names()[cap.index as usize];
579 if *cap_name == "name" {
580 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
581 } else if *cap_name == "def" {
582 def_node = Some(cap.node);
583 }
584 }
585
586 if let Some(node) = def_node {
587 let scope = crate::chunk::build_scope_chain(node, source);
588 let signature = crate::chunk::extract_signature(node, source);
589 #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
590 let start_line = node.start_position().row as u32 + 1;
591 #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
592 let end_line = node.end_position().row as u32 + 1;
593 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
594 let start_byte = node.start_byte() as u32;
595 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
596 let end_byte = node.end_byte() as u32;
597 defs.push(Definition {
598 name,
599 kind: node.kind().to_string(),
600 start_line,
601 end_line,
602 scope,
603 signature,
604 start_byte,
605 end_byte,
606 calls: vec![],
607 });
608 }
609 }
610
611 defs
612}
613
614// ── Call Extraction & Resolution ────────────────────────────────────
615
616/// Tiebreak priority for def attribution when two defs share the same byte span.
617///
618/// Returns `0` for function-like defs (lowest value = wins in `min_by_key`) and
619/// `1` for structural container defs (class bodies, impl blocks, etc.).
620///
621/// This resolves the Python case where the class body `block` and the first
622/// `function_definition` inside it occupy identical byte ranges; calls inside
623/// the function body should be attributed to the function, not the class block.
624fn is_callable_def_priority(kind: &str) -> u8 {
625 match kind {
626 // Function / method defs: these are the correct attribution targets.
627 "function_item"
628 | "function_definition"
629 | "function_declaration"
630 | "function_signature_item"
631 | "method_definition"
632 | "method_declaration"
633 | "method" => 0,
634 // Structural containers: class body blocks, impl items, etc.
635 // Prefer function-like defs over these when byte ranges tie.
636 _ => 1,
637 }
638}
639
640/// Extract call sites from a source file and assign them to definitions.
641///
642/// Uses the language's call query to find all call expressions, then
643/// assigns each call to the definition whose byte range contains it.
644/// Calls outside any definition body (module-level) are ignored.
645///
646/// For Rust scoped calls (`a::b::foo()`), the `@callee` capture returns the
647/// full `scoped_identifier` node. This function splits it into:
648/// - `name` = bare trailing identifier (`"foo"`)
649/// - `qualified_path` = `Some("a::b::foo")` for disambiguation in `resolve_calls`.
650///
651/// For method calls (`x.method()`), `receiver_type` is inferred from local
652/// context (parameter annotations, let-bindings, impl blocks). See
653/// [`infer_receiver_types`] for the heuristic.
654fn extract_calls(source: &str, call_config: &languages::CallConfig, defs: &mut [Definition]) {
655 let mut parser = Parser::new();
656 if parser.set_language(&call_config.language).is_err() {
657 return;
658 }
659 let Some(tree) = parser.parse(source, None) else {
660 return;
661 };
662
663 // Build receiver-type map: byte_offset_of_call → receiver_type_string.
664 // Done once per file to amortise the tree walk cost.
665 let receiver_map = infer_receiver_types(source, &tree, &call_config.language);
666
667 // HCL: run the HCL-specific call-edge extractor as a post-pass so the
668 // terraform_remote_state references and module blocks contribute edges
669 // that the generic function_call query cannot capture (R2 + R3, Wave 3).
670 if languages::is_hcl_language(&call_config.language) {
671 extract_hcl_call_edges(source, tree.root_node(), defs);
672 }
673
674 let mut cursor = QueryCursor::new();
675 let mut matches = cursor.matches(&call_config.query, tree.root_node(), source.as_bytes());
676
677 while let Some(m) = matches.next() {
678 let mut full_callee_text = None;
679 let mut call_byte = 0u32;
680
681 for cap in m.captures {
682 let cap_name = &call_config.query.capture_names()[cap.index as usize];
683 if *cap_name == "callee" {
684 full_callee_text =
685 Some(source[cap.node.start_byte()..cap.node.end_byte()].to_string());
686 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
687 {
688 call_byte = cap.node.start_byte() as u32;
689 }
690 }
691 }
692
693 if let Some(full_text) = full_callee_text {
694 // Split qualified path into bare name + optional qualifier.
695 let (name, qualified_path) = if full_text.contains("::") {
696 let bare = full_text
697 .rsplit("::")
698 .next()
699 .unwrap_or(&full_text)
700 .to_string();
701 (bare, Some(full_text))
702 } else {
703 (full_text, None)
704 };
705
706 // Look up receiver type from the pre-built map.
707 let receiver_type = receiver_map.get(&call_byte).cloned();
708
709 // Assign to the most-specific (smallest byte range) enclosing definition.
710 // Using `find` (first match) was incorrect for nested defs: an `impl_item`
711 // wrapping a `function_item` both contain the call site, but the
712 // `function_item` is the correct granularity for method attribution.
713 //
714 // Tiebreak: when two defs have equal byte spans (as happens in Python where
715 // the class body `block` and its first `function_definition` share the same
716 // start/end bytes), prefer function-like defs over structural container defs.
717 // `is_callable_def` returns 0 for function-like kinds (sorts first in min_by_key).
718 let enclosing_idx = defs
719 .iter()
720 .enumerate()
721 .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
722 .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
723 .map(|(i, _)| i);
724
725 if let Some(idx) = enclosing_idx {
726 // Skip self-recursive calls (compare bare name to def name).
727 if defs[idx].name != name {
728 defs[idx].calls.push(CallRef {
729 name,
730 qualified_path,
731 receiver_type,
732 byte_offset: call_byte,
733 resolved: None,
734 });
735 }
736 }
737 // Calls outside any definition are ignored (module-level init).
738 }
739 }
740}
741
742// HCL: post-pass call-edge extraction for terraform_remote_state and module
743// blocks. These are not function calls — they are HCL-specific structural
744// references to other Terraform modules — so the generic
745// `(function_call (identifier) @callee) @call` pattern in
746// `call_query_for_extension("tf")` cannot capture them. This helper runs
747// once per HCL file inside `extract_calls` (R2 + R3, Wave 3).
748
749/// Walk an HCL parse tree and emit CallRef entries for:
750///
751/// 1. `data.terraform_remote_state.<NAME>.outputs.<ATTR>` expressions:
752/// one CallRef per reference, with `name = NAME` and
753/// `qualified_path = Some("terraform_remote_state.NAME")`. These
754/// connect the current file to the named remote-state module's outputs.
755///
756/// 2. `module "X" { source = "../X" }` blocks: one CallRef with
757/// `name = X` (the label) and `qualified_path = Some("module.X")`.
758/// The module reference connects to the module's directory in
759/// `resolve_import` (HCL module-source resolution is not implemented
760/// yet — the qualified_path carrier is the contract; resolve adds the
761/// file lookup).
762///
763/// Each emitted CallRef is attached to the smallest enclosing definition
764/// by byte range — matching the same heuristic used by `extract_calls`.
765fn extract_hcl_call_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
766 // Walk all named descendants iteratively.
767 let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
768 while let Some(node) = stack.pop() {
769 // Defer to a function-style helper per node kind.
770 match node.kind() {
771 "expression" => hcl_visit_expression(source, node, defs),
772 "block" => hcl_visit_block(source, node, defs),
773 _ => {}
774 }
775 // Recurse into named children.
776 let mut cursor = node.walk();
777 for child in node.children(&mut cursor) {
778 if child.is_named() {
779 stack.push(child);
780 }
781 }
782 }
783}
784
785/// Inspect an HCL `expression` node for the
786/// `data.terraform_remote_state.<NAME>.outputs.<ATTR>` reference pattern.
787///
788/// The expression tree looks like:
789/// ```text
790/// expression
791/// variable_expr
792/// identifier "data"
793/// get_attr
794/// identifier "terraform_remote_state"
795/// get_attr
796/// identifier "<NAME>"
797/// get_attr
798/// identifier "outputs"
799/// get_attr
800/// identifier "<ATTR>"
801/// ```
802fn hcl_visit_expression(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
803 // Collect children: must be `variable_expr` (with identifier="data")
804 // followed by a chain of `get_attr` nodes (each with an `identifier` child).
805 let mut cursor = node.walk();
806 let mut child_iter = node.children(&mut cursor);
807 let Some(first) = child_iter.next() else {
808 return;
809 };
810 if first.kind() != "variable_expr" {
811 return;
812 }
813 let Some(first_id) = first.child_by_field_name("name").or_else(|| {
814 // Fallback: find first named child that's an identifier.
815 let mut c = first.walk();
816 first.children(&mut c).find(|n| n.kind() == "identifier")
817 }) else {
818 return;
819 };
820 if &source[first_id.start_byte()..first_id.end_byte()] != "data" {
821 return;
822 }
823
824 // Collect identifiers from the chain of get_attr.
825 let mut chain: Vec<String> = Vec::new();
826 for child in child_iter {
827 if child.kind() != "get_attr" {
828 return; // not a pure attribute chain
829 }
830 let mut gc = child.walk();
831 let id = child.children(&mut gc).find(|n| n.kind() == "identifier");
832 let Some(id_node) = id else { return };
833 chain.push(source[id_node.start_byte()..id_node.end_byte()].to_string());
834 }
835
836 // Expect: terraform_remote_state, <NAME>, outputs, <ATTR>
837 if chain.len() < 2 || chain[0] != "terraform_remote_state" {
838 return;
839 }
840 let name = chain[1].clone();
841 let qualified_path = format!("terraform_remote_state.{name}");
842
843 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
844 let call_byte = node.start_byte() as u32;
845 attach_hcl_call(defs, call_byte, name, Some(qualified_path));
846}
847
848/// Inspect an HCL `block` node for the `module "X" { source = "../X" }`
849/// pattern. Emits one CallRef per matching block.
850fn hcl_visit_block(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
851 // First child must be identifier="module".
852 let mut cursor = node.walk();
853 let children: Vec<tree_sitter::Node<'_>> = node.children(&mut cursor).collect();
854 let Some(first) = children.first() else {
855 return;
856 };
857 if first.kind() != "identifier" || &source[first.start_byte()..first.end_byte()] != "module" {
858 return;
859 }
860 // Next child should be a string_lit (the module label).
861 let label_node = children.iter().find(|c| c.kind() == "string_lit");
862 let Some(label_node) = label_node else {
863 return;
864 };
865 let mut lc = label_node.walk();
866 let template = label_node
867 .children(&mut lc)
868 .find(|n| n.kind() == "template_literal");
869 let Some(template) = template else {
870 return;
871 };
872 let label = source[template.start_byte()..template.end_byte()].to_string();
873 let qualified_path = format!("module.{label}");
874
875 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
876 let call_byte = node.start_byte() as u32;
877 attach_hcl_call(defs, call_byte, label, Some(qualified_path));
878}
879
880/// Attach a synthesized HCL CallRef to the smallest enclosing definition.
881/// Mirrors the byte-range attribution from `extract_calls`.
882fn attach_hcl_call(
883 defs: &mut [Definition],
884 call_byte: u32,
885 name: String,
886 qualified_path: Option<String>,
887) {
888 let enclosing_idx = defs
889 .iter()
890 .enumerate()
891 .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
892 .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
893 .map(|(i, _)| i);
894 if let Some(idx) = enclosing_idx {
895 // Skip self-recursive emission (would happen if the enclosing def
896 // happens to share the same `name` as the synthesized callee).
897 if defs[idx].name != name {
898 defs[idx].calls.push(CallRef {
899 name,
900 qualified_path,
901 receiver_type: None,
902 byte_offset: call_byte,
903 resolved: None,
904 });
905 }
906 }
907}
908
909/// Infer method-call receiver types from local context within a parse tree.
910///
911/// Returns a map from `byte_offset_of_@callee_capture` to a receiver type string.
912///
913/// Dispatches to a language-specific collector:
914///
915/// - **Rust**: [`collect_rust_receiver_types`] — three heuristic cases:
916/// 1. `self.method()` inside `impl Foo { … }` → `"Foo"`.
917/// 2. `x.method()` where `x: Bar` is a function parameter → `"Bar"`.
918/// 3. `x.method()` after `let x = Foo::new()` → `"Foo"`.
919///
920/// - **Python**: [`collect_python_receiver_types`] — two heuristic cases:
921/// 1. `self.method()` inside a class method → class name from enclosing
922/// `class_definition`.
923/// 2. `instance.method()` where `instance: ClassName` type annotation or
924/// `instance = ClassName(...)` assignment is visible in the same scope.
925///
926/// - **Go**: [`collect_go_receiver_types`] — one heuristic case:
927/// 1. `recv.Method()` inside a `method_declaration` where `recv` is the
928/// named receiver parameter → receiver type from the method signature.
929///
930/// This is heuristic, not type-inference-complete. Unknown/ambiguous cases
931/// produce no entry in the map; `extract_calls` leaves those `receiver_type = None`.
932fn infer_receiver_types(
933 source: &str,
934 tree: &tree_sitter::Tree,
935 language: &tree_sitter::Language,
936) -> HashMap<u32, String> {
937 let mut map: HashMap<u32, String> = HashMap::new();
938
939 if languages::is_rust_language(language) {
940 collect_rust_receiver_types(source, tree.root_node(), &mut map);
941 } else if languages::is_python_language(language) {
942 collect_python_receiver_types(source, tree.root_node(), &mut map);
943 } else if languages::is_go_language(language) {
944 collect_go_receiver_types(source, tree.root_node(), &mut map);
945 }
946 // Other languages: no receiver inference — leave map empty.
947
948 map
949}
950
951/// Walk the Rust parse tree and fill `map` with receiver-type inference.
952///
953/// This is a recursive descent that tracks:
954/// - The current `impl Foo` or `impl Foo for Bar` type name (for `self.*` calls).
955/// - Parameter type annotations (for `x: SomeType` → `x` has type `SomeType`).
956/// - Constructor let-bindings (`let x = Foo::new()` → `x` has type `Foo`).
957fn collect_rust_receiver_types(
958 source: &str,
959 node: tree_sitter::Node<'_>,
960 map: &mut HashMap<u32, String>,
961) {
962 // We use a stack-based walk to avoid deep recursion on large files.
963 // Each stack entry carries (node, impl_type_context).
964 let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(node, None)];
965
966 while let Some((n, impl_ctx)) = stack.pop() {
967 match n.kind() {
968 "impl_item" => {
969 // Extract `impl Foo` or `impl Trait for Foo` → capture the `for` type.
970 // tree-sitter-rust shape: `(impl_item type: (type_identifier) @type)` for
971 // inherent impls, and `(impl_item trait: … type: (type_identifier) @type)`
972 // for trait impls. Both have a child named `type`.
973 let impl_type = extract_impl_self_type(source, n);
974 let new_ctx = impl_type.or_else(|| impl_ctx.clone());
975 let mut cursor = n.walk();
976 for child in n.children(&mut cursor) {
977 stack.push((child, new_ctx.clone()));
978 }
979 }
980 "function_item" => {
981 // Build parameter bindings: (param_name → type_name).
982 let param_types = extract_param_types(source, n);
983 // Build let-binding type map from constructor calls.
984 let let_types = extract_let_binding_types(source, n);
985 // Annotate call sites within this function body.
986 annotate_method_calls(
987 source,
988 n,
989 impl_ctx.as_deref(),
990 ¶m_types,
991 &let_types,
992 map,
993 );
994 // Do NOT recurse into function_item children with the outer stack —
995 // function bodies are fully handled by annotate_method_calls.
996 // (Nested fn items would re-enter via their own impl_item context.)
997 // Push children with same impl_ctx so nested impl blocks are found.
998 let mut cursor = n.walk();
999 for child in n.children(&mut cursor) {
1000 stack.push((child, impl_ctx.clone()));
1001 }
1002 }
1003 _ => {
1004 let mut cursor = n.walk();
1005 for child in n.children(&mut cursor) {
1006 stack.push((child, impl_ctx.clone()));
1007 }
1008 }
1009 }
1010 }
1011}
1012
1013/// Extract the self type from an `impl_item` node.
1014///
1015/// For `impl Foo { … }` → `Some("Foo")`.
1016/// For `impl Trait for Foo { … }` → `Some("Foo")` (the concrete `for` type).
1017fn extract_impl_self_type(source: &str, impl_node: tree_sitter::Node<'_>) -> Option<String> {
1018 // tree-sitter-rust: impl_item has a field named "type" for the self type.
1019 // For `impl Foo for Bar { }`, "type" is Bar; for `impl Foo { }`, "type" is Foo.
1020 let type_node = impl_node.child_by_field_name("type")?;
1021 Some(source[type_node.start_byte()..type_node.end_byte()].to_string())
1022}
1023
1024/// Extract parameter name → type mappings from a function signature.
1025///
1026/// Handles `fn foo(x: Bar, y: Baz)` → `{"x": "Bar", "y": "Baz"}`.
1027/// The `self`/`&self`/`&mut self` parameter is skipped (handled via impl_ctx).
1028fn extract_param_types(source: &str, fn_node: tree_sitter::Node<'_>) -> HashMap<String, String> {
1029 let mut params: HashMap<String, String> = HashMap::new();
1030 let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1031 return params;
1032 };
1033 let mut cursor = params_node.walk();
1034 for param in params_node.children(&mut cursor) {
1035 if param.kind() == "parameter" {
1036 // parameter has children: pattern (identifier) and type
1037 let mut param_name = None;
1038 let mut param_type = None;
1039 let mut pc = param.walk();
1040 for child in param.children(&mut pc) {
1041 match child.kind() {
1042 "identifier" | "mutable_specifier" if param_name.is_none() => {
1043 let text = source[child.start_byte()..child.end_byte()].to_string();
1044 if text != "mut" {
1045 param_name = Some(text);
1046 }
1047 }
1048 "type_identifier"
1049 | "generic_type"
1050 | "reference_type"
1051 | "scoped_type_identifier"
1052 if param_type.is_none() =>
1053 {
1054 // Extract the base type identifier from potentially complex types.
1055 param_type = Some(extract_base_type(source, child));
1056 }
1057 _ => {}
1058 }
1059 }
1060 if let (Some(name), Some(ty)) = (param_name, param_type)
1061 && !ty.is_empty()
1062 {
1063 params.insert(name, ty);
1064 }
1065 }
1066 // Also handle typed_pattern in newer grammars
1067 if param.kind() == "typed_pattern" {
1068 let mut name_part = None;
1069 let mut type_part = None;
1070 let mut pc = param.walk();
1071 for child in param.children(&mut pc) {
1072 if child.kind() == "identifier" && name_part.is_none() {
1073 name_part = Some(source[child.start_byte()..child.end_byte()].to_string());
1074 } else if matches!(
1075 child.kind(),
1076 "type_identifier"
1077 | "generic_type"
1078 | "reference_type"
1079 | "scoped_type_identifier"
1080 ) && type_part.is_none()
1081 {
1082 type_part = Some(extract_base_type(source, child));
1083 }
1084 }
1085 if let (Some(name), Some(ty)) = (name_part, type_part)
1086 && !ty.is_empty()
1087 {
1088 params.insert(name, ty);
1089 }
1090 }
1091 }
1092 params
1093}
1094
1095/// Extract the base `TypeIdentifier` from a potentially complex type node.
1096///
1097/// For `Bar`, `&Bar`, `&mut Bar`, `Bar<T>` → returns `"Bar"`.
1098/// For `module::Bar` → returns `"Bar"` (bare name for matching).
1099fn extract_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1100 match node.kind() {
1101 "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1102 "generic_type" | "reference_type" | "mutable_specifier" | "scoped_type_identifier" => {
1103 // Recurse to find the innermost type_identifier
1104 let mut cursor = node.walk();
1105 for child in node.children(&mut cursor) {
1106 let t = extract_base_type(source, child);
1107 if !t.is_empty() {
1108 return t;
1109 }
1110 }
1111 String::new()
1112 }
1113 _ => {
1114 // For other nodes, try children
1115 let mut cursor = node.walk();
1116 for child in node.children(&mut cursor) {
1117 if child.kind() == "type_identifier" {
1118 return source[child.start_byte()..child.end_byte()].to_string();
1119 }
1120 }
1121 String::new()
1122 }
1123 }
1124}
1125
1126/// Scan a function body for `let x = Foo::new()` patterns.
1127///
1128/// Returns a map from local variable name to the constructor type name.
1129/// E.g., `let x = Foo::new();` → `{"x": "Foo"}`.
1130fn extract_let_binding_types(
1131 source: &str,
1132 fn_node: tree_sitter::Node<'_>,
1133) -> HashMap<String, String> {
1134 let mut bindings: HashMap<String, String> = HashMap::new();
1135
1136 let Some(body) = fn_node.child_by_field_name("body") else {
1137 return bindings;
1138 };
1139
1140 // Walk the function body looking for let_declaration nodes.
1141 let mut stack = vec![body];
1142 while let Some(n) = stack.pop() {
1143 if n.kind() == "let_declaration" {
1144 // let_declaration: pattern (identifier) + value (call_expression or …)
1145 let mut binding_name = None;
1146 let mut constructor_type = None;
1147 let mut cursor = n.walk();
1148 for child in n.children(&mut cursor) {
1149 match child.kind() {
1150 "identifier" if binding_name.is_none() => {
1151 binding_name =
1152 Some(source[child.start_byte()..child.end_byte()].to_string());
1153 }
1154 "call_expression" => {
1155 // Look for `Foo::new()` or `Foo::from(…)` patterns.
1156 // The function child of call_expression is a scoped_identifier.
1157 if let Some(func) = child.child_by_field_name("function")
1158 && func.kind() == "scoped_identifier"
1159 {
1160 // scoped_identifier path: `Foo::new` — extract head segment.
1161 let full = source[func.start_byte()..func.end_byte()].to_string();
1162 let head = full.split("::").next().unwrap_or("").to_string();
1163 if !head.is_empty()
1164 && head.chars().next().is_some_and(char::is_uppercase)
1165 {
1166 constructor_type = Some(head);
1167 }
1168 }
1169 }
1170 _ => {}
1171 }
1172 }
1173 if let (Some(name), Some(ty)) = (binding_name, constructor_type) {
1174 bindings.insert(name, ty);
1175 }
1176 }
1177 // Push children for recursive walk.
1178 let mut cursor = n.walk();
1179 for child in n.children(&mut cursor) {
1180 stack.push(child);
1181 }
1182 }
1183
1184 bindings
1185}
1186
1187/// Walk a function body and annotate method-call byte offsets with receiver types.
1188///
1189/// A "method call" in the ripvec call query is:
1190/// `(call_expression function: (field_expression field: (field_identifier) @callee))`
1191///
1192/// The receiver is the `value` child of `field_expression`. This function
1193/// checks whether the receiver is:
1194/// - `self` → use `impl_ctx` type.
1195/// - An identifier matching a parameter type in `param_types`.
1196/// - An identifier matching a constructor let-binding in `let_types`.
1197fn annotate_method_calls(
1198 source: &str,
1199 fn_node: tree_sitter::Node<'_>,
1200 impl_ctx: Option<&str>,
1201 param_types: &HashMap<String, String>,
1202 let_types: &HashMap<String, String>,
1203 map: &mut HashMap<u32, String>,
1204) {
1205 // Walk the entire function (including its body) looking for call_expression nodes.
1206 let mut stack = vec![fn_node];
1207 while let Some(n) = stack.pop() {
1208 if n.kind() == "call_expression"
1209 && let Some(func) = n.child_by_field_name("function")
1210 && func.kind() == "field_expression"
1211 {
1212 // field_expression: value (receiver) + field (method name identifier)
1213 if let (Some(recv), Some(field)) = (
1214 func.child_by_field_name("value"),
1215 func.child_by_field_name("field"),
1216 ) {
1217 let recv_text = source[recv.start_byte()..recv.end_byte()].to_string();
1218 let receiver_type = if recv_text == "self" || recv_text == "*self" {
1219 impl_ctx.map(str::to_owned)
1220 } else {
1221 // Strip ref sigils for lookup.
1222 let base = recv_text
1223 .trim_start_matches('*')
1224 .trim_start_matches('&')
1225 .trim();
1226 param_types
1227 .get(base)
1228 .or_else(|| let_types.get(base))
1229 .cloned()
1230 };
1231
1232 if let Some(ty) = receiver_type {
1233 // The `@callee` capture byte offset is the start of the field node.
1234 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1235 let field_byte = field.start_byte() as u32;
1236 map.insert(field_byte, ty);
1237 }
1238 }
1239 }
1240 let mut cursor = n.walk();
1241 for child in n.children(&mut cursor) {
1242 stack.push(child);
1243 }
1244 }
1245}
1246
1247// ── Python receiver-type heuristic ───────────────────────────────────
1248
1249/// Walk the Python parse tree and fill `map` with receiver-type inference.
1250///
1251/// Two heuristic cases:
1252///
1253/// 1. **`self.method()` inside a method** — when the `attribute` call receiver is
1254/// the literal text `self`, the receiver type is the name of the nearest
1255/// enclosing `class_definition`.
1256///
1257/// 2. **`instance.method()` with a type annotation or constructor call** —
1258/// when a function parameter has a PEP 484 annotation `param: ClassName` or
1259/// when a local assignment `param = ClassName(...)` precedes the call, the
1260/// receiver type is bound to `ClassName`.
1261///
1262/// The Python call query captures:
1263/// - `(call function: (attribute attribute: (identifier) @callee)) @call`
1264///
1265/// Within the `attribute` node, `value` is the receiver expression and
1266/// `attribute` is the method name (the `@callee` capture). The `@callee`
1267/// byte offset is the start of the `attribute` child identifier node.
1268fn collect_python_receiver_types(
1269 source: &str,
1270 root: tree_sitter::Node<'_>,
1271 map: &mut HashMap<u32, String>,
1272) {
1273 // Stack carries (node, class_ctx: Option<String>).
1274 // class_ctx is the name of the nearest enclosing class_definition.
1275 let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(root, None)];
1276
1277 while let Some((n, class_ctx)) = stack.pop() {
1278 match n.kind() {
1279 "class_definition" => {
1280 // Extract the class name from the `name` child.
1281 let class_name = n
1282 .child_by_field_name("name")
1283 .map(|c| source[c.start_byte()..c.end_byte()].to_string());
1284 let new_ctx = class_name.or_else(|| class_ctx.clone());
1285 let mut cursor = n.walk();
1286 for child in n.children(&mut cursor) {
1287 stack.push((child, new_ctx.clone()));
1288 }
1289 }
1290 "function_definition" => {
1291 // Build parameter annotation map: param_name → type_name.
1292 let param_types = extract_python_param_types(source, n);
1293 // Build local assignment map: var_name → constructor type.
1294 let let_types = extract_python_assignment_types(source, n);
1295 // Annotate attribute call sites within this function body.
1296 annotate_python_method_calls(
1297 source,
1298 n,
1299 class_ctx.as_deref(),
1300 ¶m_types,
1301 &let_types,
1302 map,
1303 );
1304 // Push children with same class_ctx so nested classes are found.
1305 let mut cursor = n.walk();
1306 for child in n.children(&mut cursor) {
1307 stack.push((child, class_ctx.clone()));
1308 }
1309 }
1310 _ => {
1311 let mut cursor = n.walk();
1312 for child in n.children(&mut cursor) {
1313 stack.push((child, class_ctx.clone()));
1314 }
1315 }
1316 }
1317 }
1318}
1319
1320/// Extract Python parameter name → type annotation mappings.
1321///
1322/// Handles PEP 484 style: `def foo(self, x: Bar, y: Baz) -> ...`.
1323/// The `self` parameter is excluded (handled via class_ctx).
1324/// Returns `{"x": "Bar", "y": "Baz"}`.
1325fn extract_python_param_types(
1326 source: &str,
1327 fn_node: tree_sitter::Node<'_>,
1328) -> HashMap<String, String> {
1329 let mut params: HashMap<String, String> = HashMap::new();
1330 let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1331 return params;
1332 };
1333
1334 // Parameters node children include `identifier`, `typed_parameter`,
1335 // `typed_default_parameter`, and others.
1336 let mut cursor = params_node.walk();
1337 for param in params_node.children(&mut cursor) {
1338 match param.kind() {
1339 "typed_parameter" => {
1340 // (typed_parameter (identifier) @name type: (type) @type)
1341 // First identifier child is the name; type child is the type.
1342 let mut name_text = None;
1343 let mut type_text = None;
1344 let mut pc = param.walk();
1345 for child in param.children(&mut pc) {
1346 match child.kind() {
1347 "identifier" if name_text.is_none() => {
1348 let t = source[child.start_byte()..child.end_byte()].to_string();
1349 if t != "self" && t != "cls" {
1350 name_text = Some(t);
1351 }
1352 }
1353 "type" | "identifier" | "attribute"
1354 if type_text.is_none() && name_text.is_some() =>
1355 {
1356 // The type child in tree-sitter-python is a `type` node
1357 // whose text is the annotation expression. Extract the
1358 // base identifier (handle `Optional[Bar]`, `List[Bar]`, etc.)
1359 type_text = Some(extract_python_base_type(source, child));
1360 }
1361 _ => {}
1362 }
1363 }
1364 if let (Some(name), Some(ty)) = (name_text, type_text)
1365 && !ty.is_empty()
1366 && !ty.eq("self")
1367 && !ty.eq("cls")
1368 {
1369 params.insert(name, ty);
1370 }
1371 }
1372 "typed_default_parameter" => {
1373 // (typed_default_parameter name: (identifier) type: (type) value: …)
1374 let name_node = param.child_by_field_name("name");
1375 let type_node = param.child_by_field_name("type");
1376 if let (Some(nn), Some(tn)) = (name_node, type_node) {
1377 let name = source[nn.start_byte()..nn.end_byte()].to_string();
1378 if name != "self" && name != "cls" {
1379 let ty = extract_python_base_type(source, tn);
1380 if !ty.is_empty() {
1381 params.insert(name, ty);
1382 }
1383 }
1384 }
1385 }
1386 _ => {}
1387 }
1388 }
1389 params
1390}
1391
1392/// Extract the base type name from a Python type annotation node.
1393///
1394/// For `Bar` → `"Bar"`. For `Optional[Bar]` or `List[Bar]` → `"Bar"`.
1395/// For `module.Class` → `"Class"` (bare name only).
1396fn extract_python_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1397 match node.kind() {
1398 "identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1399 // tree-sitter-python wraps annotations in a `type` node
1400 "type" => {
1401 let mut cursor = node.walk();
1402 for child in node.children(&mut cursor) {
1403 let t = extract_python_base_type(source, child);
1404 if !t.is_empty() {
1405 return t;
1406 }
1407 }
1408 String::new()
1409 }
1410 // Generic alias: `Optional[Bar]` — the first identifier child is `Optional`,
1411 // the subscript child contains `Bar`. We want the subscript content.
1412 "subscript" => {
1413 // subscript has value (e.g. Optional) and subscript (e.g. Bar).
1414 // Return the subscript's base type (the inner type argument).
1415 if let Some(sub) = node.child_by_field_name("subscript") {
1416 return extract_python_base_type(source, sub);
1417 }
1418 // Fall back: first identifier
1419 let mut cursor = node.walk();
1420 for child in node.children(&mut cursor) {
1421 if child.kind() == "identifier" {
1422 return source[child.start_byte()..child.end_byte()].to_string();
1423 }
1424 }
1425 String::new()
1426 }
1427 // Attribute node `module.Class` → take last identifier
1428 "attribute" => {
1429 if let Some(attr) = node.child_by_field_name("attribute") {
1430 return source[attr.start_byte()..attr.end_byte()].to_string();
1431 }
1432 String::new()
1433 }
1434 _ => {
1435 // Try first identifier child
1436 let mut cursor = node.walk();
1437 for child in node.children(&mut cursor) {
1438 if child.kind() == "identifier" {
1439 return source[child.start_byte()..child.end_byte()].to_string();
1440 }
1441 }
1442 String::new()
1443 }
1444 }
1445}
1446
1447/// Scan a Python function body for `x = ClassName(...)` assignment patterns.
1448///
1449/// Returns a map from local variable name to constructor type.
1450/// E.g., `x = Foo()` → `{"x": "Foo"}`.
1451/// Also handles `x = module.ClassName(...)` → `{"x": "ClassName"}`.
1452fn extract_python_assignment_types(
1453 source: &str,
1454 fn_node: tree_sitter::Node<'_>,
1455) -> HashMap<String, String> {
1456 let mut bindings: HashMap<String, String> = HashMap::new();
1457 let Some(body) = fn_node.child_by_field_name("body") else {
1458 return bindings;
1459 };
1460
1461 let mut stack = vec![body];
1462 while let Some(n) = stack.pop() {
1463 if n.kind() == "assignment" {
1464 // assignment: left = right
1465 // We want: left is a simple identifier, right is a call whose
1466 // function is an identifier starting with an uppercase letter
1467 // (Python convention for class names).
1468 let left = n.child_by_field_name("left");
1469 let right = n.child_by_field_name("right");
1470 if let (Some(lhs), Some(rhs)) = (left, right)
1471 && lhs.kind() == "identifier"
1472 && rhs.kind() == "call"
1473 && let Some(func) = rhs.child_by_field_name("function")
1474 {
1475 let var_name = source[lhs.start_byte()..lhs.end_byte()].to_string();
1476 let constructor_type = match func.kind() {
1477 "identifier" => {
1478 let t = source[func.start_byte()..func.end_byte()].to_string();
1479 // Class names are conventionally uppercase-first
1480 if t.chars().next().is_some_and(char::is_uppercase) {
1481 Some(t)
1482 } else {
1483 None
1484 }
1485 }
1486 "attribute" => {
1487 // `module.ClassName(...)` — take the `attribute` part
1488 func.child_by_field_name("attribute")
1489 .map(|a| source[a.start_byte()..a.end_byte()].to_string())
1490 }
1491 _ => None,
1492 };
1493 if let Some(ty) = constructor_type {
1494 bindings.insert(var_name, ty);
1495 }
1496 }
1497 }
1498 let mut cursor = n.walk();
1499 for child in n.children(&mut cursor) {
1500 stack.push(child);
1501 }
1502 }
1503 bindings
1504}
1505
1506/// Walk a Python function body and annotate attribute-call byte offsets.
1507///
1508/// A Python method call is:
1509/// `(call function: (attribute value: <receiver> attribute: (identifier) @callee))`
1510///
1511/// The receiver is the `value` child of `attribute`. This function checks:
1512/// - `self` → use the enclosing class name (`class_ctx`).
1513/// - An identifier matching a parameter type in `param_types`.
1514/// - An identifier matching a constructor assignment in `let_types`.
1515fn annotate_python_method_calls(
1516 source: &str,
1517 fn_node: tree_sitter::Node<'_>,
1518 class_ctx: Option<&str>,
1519 param_types: &HashMap<String, String>,
1520 let_types: &HashMap<String, String>,
1521 map: &mut HashMap<u32, String>,
1522) {
1523 let mut stack = vec![fn_node];
1524 while let Some(n) = stack.pop() {
1525 if n.kind() == "call"
1526 && let Some(func) = n.child_by_field_name("function")
1527 && func.kind() == "attribute"
1528 && let (Some(recv_node), Some(attr_node)) = (
1529 func.child_by_field_name("object"),
1530 func.child_by_field_name("attribute"),
1531 )
1532 {
1533 // attribute node: object (receiver) + attribute (method name)
1534 let recv_text = source[recv_node.start_byte()..recv_node.end_byte()].to_string();
1535 let receiver_type = if recv_text == "self" || recv_text == "cls" {
1536 class_ctx.map(str::to_owned)
1537 } else if recv_node.kind() == "identifier" {
1538 param_types
1539 .get(&recv_text)
1540 .or_else(|| let_types.get(&recv_text))
1541 .cloned()
1542 } else {
1543 None
1544 };
1545
1546 if let Some(ty) = receiver_type {
1547 // The `@callee` capture byte offset is the `attribute` child.
1548 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1549 let attr_byte = attr_node.start_byte() as u32;
1550 map.insert(attr_byte, ty);
1551 }
1552 }
1553 let mut cursor = n.walk();
1554 for child in n.children(&mut cursor) {
1555 stack.push(child);
1556 }
1557 }
1558}
1559
1560// ── Python class hierarchy (MRO) extraction ───────────────────────────
1561
1562/// Walk the Python parse tree and add class → parent-names entries to `out`.
1563///
1564/// tree-sitter-python shape:
1565/// ```text
1566/// (class_definition
1567/// name: (identifier) @child
1568/// superclasses: (argument_list
1569/// (identifier) @parent ; bare parent: class Foo(Bar):
1570/// (attribute attribute: (identifier) @parent) ; qualified: class Foo(mod.Bar):
1571/// (keyword_argument …) ; ignored: class Foo(Bar, metaclass=Meta):
1572/// )?
1573/// ...)
1574/// ```
1575///
1576/// Each map entry's key is a class defined in the file; the value is the
1577/// ordered list of declared parent class **names** (the trailing `attribute`
1578/// segment, so `mod.Bar` becomes `Bar`). Classes with no declared parents
1579/// still get an entry (empty `Vec`) so a downstream MRO walk can tell
1580/// "known class with no parents" from "unknown class".
1581fn extract_python_class_hierarchy_node(
1582 source: &str,
1583 root: tree_sitter::Node<'_>,
1584 out: &mut HashMap<String, Vec<String>>,
1585) {
1586 let mut stack = vec![root];
1587 while let Some(n) = stack.pop() {
1588 if n.kind() == "class_definition"
1589 && let Some(name_node) = n.child_by_field_name("name")
1590 {
1591 let class_name = source[name_node.start_byte()..name_node.end_byte()].to_string();
1592 let mut parents: Vec<String> = Vec::new();
1593 if let Some(superclasses) = n.child_by_field_name("superclasses") {
1594 // superclasses is an `argument_list`; iterate its children and
1595 // collect identifiers / attribute trailing-names. Skip
1596 // keyword_argument entries (metaclass=…, etc.) and punctuation.
1597 let mut sc = superclasses.walk();
1598 for child in superclasses.children(&mut sc) {
1599 match child.kind() {
1600 "identifier" => {
1601 let t = source[child.start_byte()..child.end_byte()].to_string();
1602 parents.push(t);
1603 }
1604 "attribute" => {
1605 // module.Cls → take the trailing `attribute` segment.
1606 if let Some(attr) = child.child_by_field_name("attribute") {
1607 parents
1608 .push(source[attr.start_byte()..attr.end_byte()].to_string());
1609 }
1610 }
1611 // Drop keyword_argument, "(", ")", ",", comments, etc.
1612 _ => {}
1613 }
1614 }
1615 }
1616 out.insert(class_name, parents);
1617 }
1618 let mut cursor = n.walk();
1619 for child in n.children(&mut cursor) {
1620 stack.push(child);
1621 }
1622 }
1623}
1624
1625/// Extract the Python `class → [parents]` map from a single source file by
1626/// parsing it with tree-sitter-python.
1627///
1628/// Returns an empty map when the source fails to parse or contains no
1629/// `class_definition` nodes. The returned map is the per-file contribution
1630/// to the global hierarchy used by [`resolve_calls_with_python_mro_pub`]
1631/// for MRO-aware receiver-type dispatch (Q1, Wave 2).
1632#[must_use]
1633pub fn extract_python_class_hierarchy(source: &str) -> HashMap<String, Vec<String>> {
1634 let mut parser = Parser::new();
1635 let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
1636 if parser.set_language(&lang).is_err() {
1637 return HashMap::new();
1638 }
1639 let Some(tree) = parser.parse(source, None) else {
1640 return HashMap::new();
1641 };
1642 let mut out: HashMap<String, Vec<String>> = HashMap::new();
1643 extract_python_class_hierarchy_node(source, tree.root_node(), &mut out);
1644 out
1645}
1646
1647/// Compute the linearised MRO (Method Resolution Order) for a Python class
1648/// name using a **simplified left-first depth-first walk** of the declared
1649/// `class → [parents]` hierarchy.
1650///
1651/// Python's real MRO uses C3 linearisation, which is monotonic and respects
1652/// declaration order across the diamond inheritance shape. For ripvec's
1653/// reverse-call-graph purpose we want *any plausible ancestor* of the
1654/// receiver type — including ancestors only reachable via a mixin — so we
1655/// can resolve `self.method()` calls whose dispatch lands on an ancestor.
1656/// The simplification: pre-order DFS, left-to-right, skipping cycles via a
1657/// `visited` set.
1658///
1659/// On a non-diamond shape this matches C3 exactly. On a diamond the
1660/// simplified walk may surface an ancestor earlier than C3 would, but every
1661/// ancestor C3 would visit is still reached — and ripvec's goal is "find
1662/// the implementing def for an inherited call", not "compute the runtime
1663/// dispatch winner". Over-approximating ancestors only matters when two
1664/// ancestors define the same method, and even then the left-first order
1665/// matches C3 on the common patterns
1666/// (`class Sub(Base, Mixin)` → `Sub, Base, Mixin, <Base's ancestors>, <Mixin's ancestors>`).
1667///
1668/// The returned list excludes the start class itself. Each entry appears at
1669/// most once even when reachable through multiple parent chains.
1670fn compute_python_mro<H: std::hash::BuildHasher>(
1671 start: &str,
1672 hierarchy: &HashMap<String, Vec<String>, H>,
1673) -> Vec<String> {
1674 use std::collections::HashSet;
1675 let mut order: Vec<String> = Vec::new();
1676 let mut visited: HashSet<String> = HashSet::new();
1677 // Start with the immediate parents of `start` (the receiver-type's own
1678 // scope was already searched by Priority 2's direct match).
1679 let Some(start_parents) = hierarchy.get(start) else {
1680 return order;
1681 };
1682 // DFS stack: we push in reverse so pop yields left-first order.
1683 let mut stack: Vec<String> = start_parents.iter().rev().cloned().collect();
1684 while let Some(cls) = stack.pop() {
1685 if !visited.insert(cls.clone()) {
1686 continue;
1687 }
1688 order.push(cls.clone());
1689 if let Some(parents) = hierarchy.get(&cls) {
1690 for p in parents.iter().rev() {
1691 if !visited.contains(p) {
1692 stack.push(p.clone());
1693 }
1694 }
1695 }
1696 }
1697 order
1698}
1699
1700// ── Go receiver-type heuristic ────────────────────────────────────────
1701
1702/// Walk the Go parse tree and fill `map` with receiver-type inference.
1703///
1704/// One heuristic case: **`recv.Method()` inside a `method_declaration`**.
1705///
1706/// Go methods have an explicit receiver parameter in their signature:
1707/// `func (r *Foo) Bar() { r.Baz() }` — `r` is bound to type `Foo`.
1708///
1709/// The Go call query captures:
1710/// `(call_expression function: (selector_expression field: (field_identifier) @callee))`
1711///
1712/// Within `selector_expression`, `operand` is the receiver expression and
1713/// `field` is the method name (the `@callee` capture).
1714///
1715/// This function also handles `self.method()` patterns for cases where code
1716/// uses `self` as a receiver name (not idiomatic Go, but it occurs).
1717fn collect_go_receiver_types(
1718 source: &str,
1719 root: tree_sitter::Node<'_>,
1720 map: &mut HashMap<u32, String>,
1721) {
1722 // Stack carries (node, receiver_binding: Option<(recv_name, recv_type)>).
1723 let mut stack: Vec<(tree_sitter::Node<'_>, Option<(String, String)>)> = vec![(root, None)];
1724
1725 while let Some((n, recv_binding)) = stack.pop() {
1726 if n.kind() == "method_declaration" {
1727 // Extract the receiver name and type from the method signature.
1728 let binding = extract_go_receiver_binding(source, n);
1729 let new_binding = binding.or_else(|| recv_binding.clone());
1730 let mut cursor = n.walk();
1731 for child in n.children(&mut cursor) {
1732 stack.push((child, new_binding.clone()));
1733 }
1734 } else {
1735 // For any call_expression whose function is a selector_expression,
1736 // check if the operand matches the active receiver binding.
1737 if n.kind() == "call_expression"
1738 && let Some(func) = n.child_by_field_name("function")
1739 && func.kind() == "selector_expression"
1740 && let (Some(operand), Some(field)) = (
1741 func.child_by_field_name("operand"),
1742 func.child_by_field_name("field"),
1743 )
1744 {
1745 let recv_text = source[operand.start_byte()..operand.end_byte()].to_string();
1746 let receiver_type = recv_binding.as_ref().and_then(|(recv_name, recv_ty)| {
1747 if recv_text == *recv_name {
1748 Some(recv_ty.clone())
1749 } else {
1750 None
1751 }
1752 });
1753
1754 if let Some(ty) = receiver_type {
1755 // The `@callee` capture byte offset is the `field` child.
1756 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1757 let field_byte = field.start_byte() as u32;
1758 map.insert(field_byte, ty);
1759 }
1760 }
1761
1762 let mut cursor = n.walk();
1763 for child in n.children(&mut cursor) {
1764 stack.push((child, recv_binding.clone()));
1765 }
1766 }
1767 }
1768}
1769
1770/// Extract the receiver name and base type from a Go `method_declaration`.
1771///
1772/// Go method declaration shape (tree-sitter-go):
1773/// ```text
1774/// (method_declaration
1775/// receiver: (parameter_list
1776/// (parameter_declaration
1777/// name: (identifier) ← receiver name
1778/// type: (type_identifier ← receiver type (bare)
1779/// | pointer_type (type_identifier)) ← or *Type
1780/// )
1781/// )
1782/// name: (field_identifier)
1783/// ...
1784/// )
1785/// ```
1786///
1787/// Returns `Some((recv_name, type_name))` or `None` if the receiver is unnamed
1788/// (blank identifier `_`) or has an unrecognisable shape.
1789fn extract_go_receiver_binding(
1790 source: &str,
1791 method_node: tree_sitter::Node<'_>,
1792) -> Option<(String, String)> {
1793 let receiver_list = method_node.child_by_field_name("receiver")?;
1794 // parameter_list contains one parameter_declaration
1795 let mut cursor = receiver_list.walk();
1796 for param in receiver_list.children(&mut cursor) {
1797 if param.kind() == "parameter_declaration" {
1798 let name_node = param.child_by_field_name("name");
1799 let type_node = param.child_by_field_name("type");
1800 if let (Some(nn), Some(tn)) = (name_node, type_node) {
1801 let name = source[nn.start_byte()..nn.end_byte()].to_string();
1802 if name == "_" || name.is_empty() {
1803 return None;
1804 }
1805 let ty = extract_go_base_type(source, tn);
1806 if !ty.is_empty() {
1807 return Some((name, ty));
1808 }
1809 }
1810 }
1811 }
1812 None
1813}
1814
1815/// Extract the base type name from a Go type node.
1816///
1817/// For `Foo` (type_identifier) → `"Foo"`.
1818/// For `*Foo` (pointer_type → type_identifier) → `"Foo"`.
1819fn extract_go_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1820 match node.kind() {
1821 "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1822 "pointer_type" => {
1823 // pointer_type has one child: the pointee type
1824 let mut cursor = node.walk();
1825 for child in node.children(&mut cursor) {
1826 if child.kind() == "type_identifier" {
1827 return source[child.start_byte()..child.end_byte()].to_string();
1828 }
1829 let t = extract_go_base_type(source, child);
1830 if !t.is_empty() {
1831 return t;
1832 }
1833 }
1834 String::new()
1835 }
1836 _ => {
1837 let mut cursor = node.walk();
1838 for child in node.children(&mut cursor) {
1839 if child.kind() == "type_identifier" {
1840 return source[child.start_byte()..child.end_byte()].to_string();
1841 }
1842 }
1843 String::new()
1844 }
1845 }
1846}
1847
1848/// Enrich Go `method_declaration` definition scopes with their receiver type name.
1849///
1850/// In the generic `extract_definitions` path, `build_scope_chain` walks the
1851/// *parent* chain of the `@def` node. For Go `method_declaration`, the parent
1852/// is the file root — so the scope is always `""`.
1853///
1854/// An empty scope means `resolve_calls` Priority 2 (receiver-type matching via
1855/// `scope.contains(recv_type)`) never fires for Go methods. Cross-file calls
1856/// where the caller inferred `receiver_type = Some("Foo")` stay unresolved;
1857/// no edge is recorded; `def_callers[]` stays empty for those defs — the root
1858/// cause of the missing inverse index for Go (I#P1).
1859///
1860/// Fix: after `extract_definitions`, parse the Go source a second time to find
1861/// each `method_declaration`'s receiver type, then set the matching def's scope
1862/// to `"method_declaration {ReceiverType}"`. This matches the pattern used by
1863/// the existing `go_resolve_receiver_method_via_signature` integration test,
1864/// which asserts that `scope.contains("Foo")` succeeds when the scope is
1865/// `"method_declaration Foo"`.
1866///
1867/// Matching is by `start_byte` (precise) so name collisions across different
1868/// receiver types are handled correctly.
1869fn enrich_go_method_def_scopes(source: &str, defs: &mut [Definition]) {
1870 let go_lang: tree_sitter::Language = tree_sitter_go::LANGUAGE.into();
1871 let mut parser = Parser::new();
1872 if parser.set_language(&go_lang).is_err() {
1873 return;
1874 }
1875 let Some(tree) = parser.parse(source, None) else {
1876 return;
1877 };
1878
1879 // Walk all top-level method_declaration nodes.
1880 let root = tree.root_node();
1881 let mut method_cursor = root.walk();
1882 for child in root.children(&mut method_cursor) {
1883 if child.kind() != "method_declaration" {
1884 continue;
1885 }
1886 let Some((_, recv_type)) = extract_go_receiver_binding(source, child) else {
1887 continue;
1888 };
1889 // Match by start_byte (precise): the @def node for method_declaration in
1890 // the Go definition query is the method_declaration node itself, so its
1891 // start_byte matches the def's start_byte recorded during extract_definitions.
1892 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1893 let method_start_byte = child.start_byte() as u32;
1894 for def in defs.iter_mut() {
1895 if def.kind == "method_declaration" && def.start_byte == method_start_byte {
1896 def.scope = format!("method_declaration {recv_type}");
1897 break;
1898 }
1899 }
1900 }
1901}
1902
1903/// Public wrapper for `enrich_go_method_def_scopes` — enables integration tests
1904/// to call it directly without going through the full `build_graph` pipeline.
1905pub fn enrich_go_method_def_scopes_pub(source: &str, defs: &mut [Definition]) {
1906 enrich_go_method_def_scopes(source, defs);
1907}
1908
1909/// SQL: prepend a synthetic whole-file definition whose name is the filename stem.
1910///
1911/// dbt and sqlmesh follow a filename-as-model-name convention:
1912///
1913/// - `silver_issuer_returns.sql` defines the `silver_issuer_returns` model.
1914/// - `gold_issuer_returns.sql` references the silver model by filename stem,
1915/// not by any in-source CREATE TABLE.
1916///
1917/// In sqlmesh, the in-source name is templated:
1918///
1919/// ```sql
1920/// MODEL (
1921/// name @{athena_sqlmesh_silver_schema}.issuer_returns,
1922/// ...
1923/// );
1924/// SELECT ... FROM @{athena_sqlmesh_silver_schema}.stg_issuer_returns;
1925/// ```
1926///
1927/// The `MODEL (...)` header parses as an ERROR node under tree-sitter-sequel
1928/// because `@{var}` is not standard SQL; FROM/JOIN further down the file still
1929/// extract cleanly. Without a synthetic def, `lsp_workspace_symbols(query=
1930/// "silver_issuer_returns")` returns no hits — there is no real CREATE TABLE
1931/// in the file and the model name is interpolation only.
1932///
1933/// This helper prepends a definition with:
1934/// - `name` = filename stem (e.g., `silver_issuer_returns`)
1935/// - `kind` = `"sql_file"` (maps to `LSP SymbolKind::File` in
1936/// [`languages::lsp_symbol_kind_for_node_kind`])
1937/// - byte range = the entire source `[0, source.len())`
1938/// - scope / signature / qualified_name = empty / None
1939///
1940/// The whole-file byte range is the key to FROM/JOIN attribution: when
1941/// `extract_calls` later places a CallRef from a FROM clause that is not
1942/// inside any CTE or other smaller def, the smallest-enclosing-def search
1943/// lands on this synthetic file def and the edge is recorded.
1944///
1945/// If the filename has no stem (empty / `..`), the helper is a no-op.
1946/// Idempotent: if a `sql_file` def already exists at byte 0, it is left alone.
1947pub(crate) fn enrich_sql_file_def(filename: &str, source: &str, defs: &mut Vec<Definition>) {
1948 // Idempotency: do nothing if a sql_file def is already present at byte 0.
1949 if defs
1950 .iter()
1951 .any(|d| d.kind == "sql_file" && d.start_byte == 0)
1952 {
1953 return;
1954 }
1955
1956 // Derive the filename stem (last path component, file extension stripped).
1957 let stem = std::path::Path::new(filename)
1958 .file_stem()
1959 .and_then(|s| s.to_str())
1960 .unwrap_or_default();
1961 if stem.is_empty() {
1962 return;
1963 }
1964
1965 // Count newlines so end_line is reasonable for downstream UI.
1966 let end_line_zero_based = source.bytes().filter(|&b| b == b'\n').count();
1967 #[expect(clippy::cast_possible_truncation, reason = "line counts fit in u32")]
1968 let end_line = (end_line_zero_based as u32) + 1;
1969 #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1970 let end_byte = source.len() as u32;
1971
1972 let file_def = Definition {
1973 name: stem.to_string(),
1974 kind: "sql_file".to_string(),
1975 start_line: 1,
1976 end_line,
1977 scope: String::new(),
1978 signature: None,
1979 start_byte: 0,
1980 end_byte,
1981 calls: vec![],
1982 };
1983 // Prepend so it remains the outermost (largest) enclosing def at byte 0,
1984 // ensuring narrow CTE defs are still preferred for inner-FROM attribution
1985 // (the smallest-enclosing rule in `extract_calls`).
1986 defs.insert(0, file_def);
1987}
1988
1989/// Public wrapper for [`enrich_sql_file_def`] — enables integration tests
1990/// to call it directly without going through the full `build_graph` pipeline.
1991///
1992/// This is `pub` (not `pub(crate)`) because integration tests in
1993/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
1994/// `pub(crate)` items.
1995pub fn enrich_sql_file_def_pub(filename: &str, source: &str, defs: &mut Vec<Definition>) {
1996 enrich_sql_file_def(filename, source, defs);
1997}
1998
1999/// Public wrapper for `extract_calls` — enables integration tests to call it
2000/// directly without going through the full `build_graph` pipeline.
2001///
2002/// This is `pub` (not `pub(crate)`) because integration tests in
2003/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
2004/// `pub(crate)` items.
2005pub fn extract_calls_pub(
2006 source: &str,
2007 call_config: &languages::CallConfig,
2008 defs: &mut [Definition],
2009) {
2010 extract_calls(source, call_config, defs);
2011}
2012
2013/// Build an index from definition name to list of `DefId`s.
2014#[must_use]
2015pub fn build_def_index_pub(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2016 build_def_index(files)
2017}
2018
2019fn build_def_index(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2020 let mut index: HashMap<String, Vec<DefId>> = HashMap::new();
2021 for (file_idx, file) in files.iter().enumerate() {
2022 for (def_idx, def) in file.defs.iter().enumerate() {
2023 #[expect(clippy::cast_possible_truncation)]
2024 let did: DefId = (file_idx as u32, def_idx as u16);
2025 index.entry(def.name.clone()).or_default().push(did);
2026 }
2027 }
2028 index
2029}
2030
2031/// Resolve call references to target definitions.
2032///
2033/// Resolution priority:
2034///
2035/// 1. **Qualified path** (`qualified_path = Some("mod_a::foo")`): filter candidates
2036/// by qualifier match (file path or scope contains the qualifier segment). Unique
2037/// match → resolve; ambiguous or no match → leave `None`.
2038/// 2. **Receiver type — direct scope match** (`receiver_type = Some("Foo")`):
2039/// for method calls, prefer candidates whose `scope` contains the receiver
2040/// type name (e.g., `"impl_item Foo"`). Among receiver-matching candidates,
2041/// further prefer those in imported files. Unique match → resolve;
2042/// ambiguous → leave `None`.
2043/// When this step finds nothing on the receiver class itself, sub-step 2b
2044/// (Python MRO walk) runs: when `Foo` has a recorded parent chain, walk
2045/// the receiver class's MRO (left-first DFS) and try the scope-match
2046/// against each ancestor's name. First ancestor with a matching candidate
2047/// wins. See [`compute_python_mro`] for the simplification rationale (it
2048/// diverges from C3 only on diamond shapes where two ancestors define the
2049/// same name).
2050/// 3. **Same file** (unqualified, no receiver): prefer definitions in the caller's
2051/// own file.
2052/// 4. **SQL suffix-match** (sql_file callers only, no exact-name match): when
2053/// the caller def has `kind = "sql_file"` and the bare `call_name` (e.g.,
2054/// `"issuer_returns"`) has no exact entry in the def index, scan all
2055/// `sql_file` defs for names ending with `_<call_name>` (e.g.,
2056/// `"silver_issuer_returns"`). This bridges dbt / sqlmesh layered schema
2057/// prefixes: `gold_issuer_returns.sql` uses `FROM @{schema}.issuer_returns`
2058/// which tree-sitter reduces to the bare name `"issuer_returns"`, while the
2059/// target def is the synthetic `sql_file` def named `"silver_issuer_returns"`.
2060/// Unique suffix-match → resolve; ambiguous (multiple layers match) or
2061/// no match → leave `None`. Non-sql_file callers are explicitly excluded.
2062/// 5. **Imported file** (unqualified, no receiver): check definitions in files this
2063/// file imports. Unique imported candidate → resolve.
2064/// 6. **Ambiguous or unresolved**: leave `resolved` as `None` (no silent first-wins).
2065///
2066/// Equivalent to [`resolve_calls_with_python_mro_pub`] with an empty MRO map
2067/// (Priority 2.5 is a no-op).
2068pub fn resolve_calls_pub<S: std::hash::BuildHasher>(
2069 files: &mut [FileNode],
2070 def_index: &HashMap<String, Vec<DefId>, S>,
2071) {
2072 let empty: HashMap<String, Vec<String>> = HashMap::new();
2073 resolve_calls_inner(files, def_index, &empty);
2074}
2075
2076/// Resolve call references with MRO-aware Python receiver dispatch enabled.
2077///
2078/// Identical to [`resolve_calls_pub`] except that Priority 2.5 (the MRO walk)
2079/// fires when the caller passes a non-empty `python_class_hierarchy`.
2080/// `build_graph` populates the hierarchy by parsing every Python source file
2081/// with [`extract_python_class_hierarchy`] and merging the per-file maps.
2082///
2083/// Tests that want to exercise MRO resolution without going through
2084/// `build_graph` can call this directly with a synthetic hierarchy.
2085pub fn resolve_calls_with_python_mro_pub<S, H>(
2086 files: &mut [FileNode],
2087 def_index: &HashMap<String, Vec<DefId>, S>,
2088 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2089) where
2090 S: std::hash::BuildHasher,
2091 H: std::hash::BuildHasher,
2092{
2093 resolve_calls_inner(files, def_index, python_class_hierarchy);
2094}
2095
2096fn resolve_calls<S, H>(
2097 files: &mut [FileNode],
2098 def_index: &HashMap<String, Vec<DefId>, S>,
2099 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2100) where
2101 S: std::hash::BuildHasher,
2102 H: std::hash::BuildHasher,
2103{
2104 resolve_calls_inner(files, def_index, python_class_hierarchy);
2105}
2106
2107#[expect(
2108 clippy::too_many_lines,
2109 reason = "7-priority resolution cascade (qualified path, receiver type, MRO walk, same-file, \
2110 SQL suffix-match, imported-file, ambiguous); each priority is a distinct decision \
2111 branch and extracting helpers would require passing large shared state across boundaries"
2112)]
2113fn resolve_calls_inner<S, H>(
2114 files: &mut [FileNode],
2115 def_index: &HashMap<String, Vec<DefId>, S>,
2116 python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2117) where
2118 S: std::hash::BuildHasher,
2119 H: std::hash::BuildHasher,
2120{
2121 // Pre-compute imported file sets for each file.
2122 let imported_files: Vec<std::collections::HashSet<u32>> = files
2123 .iter()
2124 .map(|f| {
2125 f.imports
2126 .iter()
2127 .filter_map(|imp| imp.resolved_idx)
2128 .collect()
2129 })
2130 .collect();
2131
2132 for file_idx in 0..files.len() {
2133 for def_idx in 0..files[file_idx].defs.len() {
2134 for call_idx in 0..files[file_idx].defs[def_idx].calls.len() {
2135 let call_name = files[file_idx].defs[def_idx].calls[call_idx].name.clone();
2136 let qualified_path = files[file_idx].defs[def_idx].calls[call_idx]
2137 .qualified_path
2138 .clone();
2139 let receiver_type = files[file_idx].defs[def_idx].calls[call_idx]
2140 .receiver_type
2141 .clone();
2142
2143 // HCL: dedicated resolution for `terraform_remote_state.<NAME>`
2144 // and `module.<NAME>` qualified paths. Aurora's module DAG is
2145 // expressed by these patterns; resolve to the first def in any
2146 // file under a `/<NAME>/` directory segment. This is the
2147 // module-source contract (R2 + R3, Wave 3).
2148 if let Some(ref qpath) = qualified_path
2149 && (qpath.starts_with("terraform_remote_state.")
2150 || qpath.starts_with("module."))
2151 {
2152 let target = &call_name; // already the bare module label
2153 let segment_match = format!("/{target}/");
2154 let alt_segment_prefix = format!("{target}/"); // when path starts with target dir
2155 let candidate = files.iter().enumerate().find_map(|(idx, f)| {
2156 if f.path.contains(&segment_match)
2157 || f.path.starts_with(&alt_segment_prefix)
2158 {
2159 // Pick the first def in the file (or skip if file
2160 // has no defs).
2161 if !f.defs.is_empty() {
2162 #[expect(
2163 clippy::cast_possible_truncation,
2164 reason = "file index fits in u32"
2165 )]
2166 {
2167 return Some((idx as u32, 0u16));
2168 }
2169 }
2170 }
2171 None
2172 });
2173 if let Some(did) = candidate {
2174 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2175 }
2176 continue;
2177 }
2178
2179 // ── Priority 1: Qualified-path resolution ────────────────
2180 //
2181 // `qualified_path` carries the full scoped path (e.g. "mod_a::foo").
2182 // We look up candidates by the bare `call_name`, then filter by
2183 // whether the file path or scope contains the qualifier prefix.
2184 if let Some(ref qpath) = qualified_path {
2185 // Qualifier is everything before the final `::`.
2186 let qualifier = if let Some(pos) = qpath.rfind("::") {
2187 &qpath[..pos]
2188 } else {
2189 qpath.as_str()
2190 };
2191 let qual_segments: Vec<&str> = qualifier.split("::").collect();
2192
2193 let Some(candidates) = def_index.get(&call_name) else {
2194 continue;
2195 };
2196
2197 let matching: Vec<DefId> = candidates
2198 .iter()
2199 .copied()
2200 .filter(|&(f_idx, _)| {
2201 let file_path = &files[f_idx as usize].path;
2202 let last_segment = qual_segments.last().copied().unwrap_or("");
2203 let path_as_module =
2204 file_path.trim_end_matches(".rs").replace(['/', '\\'], "::");
2205 path_as_module.contains(last_segment)
2206 || file_path.contains(last_segment)
2207 })
2208 .collect();
2209
2210 if matching.len() == 1 {
2211 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(matching[0]);
2212 }
2213 // Ambiguous or no match → leave None.
2214 continue;
2215 }
2216
2217 // ── Priority 3.5: SQL suffix-match resolution ────────────
2218 //
2219 // dbt / sqlmesh pipelines use layered schema prefixes:
2220 // `silver_issuer_returns.sql` defines the silver-layer model.
2221 // `gold_issuer_returns.sql` references it via a FROM clause
2222 // that tree-sitter parses as `name: "issuer_returns"` (the
2223 // `name:` field-selector strips the `@{schema}.` prefix).
2224 // `def_index.get("issuer_returns")` returns None — the def is
2225 // stored under "silver_issuer_returns".
2226 //
2227 // When: (a) no exact-name candidate exists, AND (b) the
2228 // caller's enclosing def is a sql_file (whole-file synthetic
2229 // def emitted by `enrich_sql_file_def`), walk every sql_file
2230 // def in the graph and check whether its name ends with
2231 // `_<call_name>`. Unique suffix-match → resolve. Ambiguous
2232 // (e.g., both gold_ and silver_ match the same bare name) →
2233 // leave None (no silent first-wins).
2234 //
2235 // Non-sql_file callers are explicitly excluded: a Rust
2236 // function_item whose call_name happens to end with a suffix
2237 // of some sql_file def must NOT be resolved via this path.
2238 if !def_index.contains_key(&call_name)
2239 && files[file_idx].defs[def_idx].kind == "sql_file"
2240 && !call_name.is_empty()
2241 {
2242 let suffix = format!("_{call_name}");
2243 let suffix_str = suffix.as_str();
2244 // Exclude the caller def itself from the suffix scan:
2245 // `gold_issuer_returns` also ends with `_issuer_returns`
2246 // but must not self-resolve.
2247 #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
2248 let caller_did: DefId = (file_idx as u32, def_idx as u16);
2249 let suffix_matches: Vec<DefId> = files
2250 .iter()
2251 .enumerate()
2252 .flat_map(|(f_idx, f)| {
2253 f.defs.iter().enumerate().filter_map(move |(d_idx, d)| {
2254 #[expect(
2255 clippy::cast_possible_truncation,
2256 reason = "file and def indices fit in u32/u16"
2257 )]
2258 let did: DefId = (f_idx as u32, d_idx as u16);
2259 if d.kind == "sql_file"
2260 && d.name.ends_with(suffix_str)
2261 && did != caller_did
2262 {
2263 Some(did)
2264 } else {
2265 None
2266 }
2267 })
2268 })
2269 .collect();
2270 if suffix_matches.len() == 1 {
2271 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2272 Some(suffix_matches[0]);
2273 }
2274 // Ambiguous (>1) or no match (0) → leave None.
2275 continue;
2276 }
2277
2278 let Some(candidates) = def_index.get(&call_name) else {
2279 continue;
2280 };
2281
2282 // ── Priority 2: Receiver-type resolution ─────────────────
2283 //
2284 // `receiver_type = Some("Foo")` means this is a method call on a
2285 // value whose type is `Foo`. Filter candidates to those whose scope
2286 // chain contains the receiver type name.
2287 if let Some(ref rtype) = receiver_type {
2288 // Candidates whose scope contains the receiver type name.
2289 let receiver_matching: Vec<DefId> = candidates
2290 .iter()
2291 .copied()
2292 .filter(|&(f_idx, d_idx)| {
2293 let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2294 scope.contains(rtype.as_str())
2295 })
2296 .collect();
2297
2298 if receiver_matching.len() == 1 {
2299 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2300 Some(receiver_matching[0]);
2301 continue;
2302 }
2303
2304 if receiver_matching.len() > 1 {
2305 // Among receiver-matching candidates, prefer those in imported files.
2306 let imported_receiver_matching: Vec<DefId> = receiver_matching
2307 .iter()
2308 .copied()
2309 .filter(|(f, _)| imported_files[file_idx].contains(f))
2310 .collect();
2311 if imported_receiver_matching.len() == 1 {
2312 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2313 Some(imported_receiver_matching[0]);
2314 }
2315 // Ambiguous even after import filter → leave None.
2316 continue;
2317 }
2318
2319 // ── Priority 2.5: Python MRO walk ─────────────────────
2320 //
2321 // The receiver type's own scope has no matching def — but
2322 // the method may live on an ancestor class. Walk the MRO
2323 // (left-first DFS) and try the scope-match against each
2324 // ancestor's name. First ancestor with at least one
2325 // scope-matching candidate wins; if multiple candidates
2326 // match for the same ancestor, prefer imported files,
2327 // else take the first in stable order.
2328 //
2329 // Liskov: a subclass's `self.method()` call must dispatch
2330 // through the MRO; over-approximating ancestors is the
2331 // correct conservative move for a reverse call graph.
2332 // For non-Python languages or Python receivers with no
2333 // recorded parents, `compute_python_mro` returns an
2334 // empty vector and this loop is a no-op.
2335 let mro = compute_python_mro(rtype, python_class_hierarchy);
2336 let mut resolved_via_mro: Option<DefId> = None;
2337 for ancestor in &mro {
2338 let ancestor_matching: Vec<DefId> = candidates
2339 .iter()
2340 .copied()
2341 .filter(|&(f_idx, d_idx)| {
2342 let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2343 scope.contains(ancestor.as_str())
2344 })
2345 .collect();
2346 if ancestor_matching.len() == 1 {
2347 resolved_via_mro = Some(ancestor_matching[0]);
2348 break;
2349 }
2350 if ancestor_matching.len() > 1 {
2351 // Prefer imported files among the ancestor matches.
2352 let imported_ancestor: Vec<DefId> = ancestor_matching
2353 .iter()
2354 .copied()
2355 .filter(|(f, _)| imported_files[file_idx].contains(f))
2356 .collect();
2357 if imported_ancestor.len() == 1 {
2358 resolved_via_mro = Some(imported_ancestor[0]);
2359 break;
2360 }
2361 // Ambiguous at this ancestor — pick the first
2362 // candidate in stable order. The MRO walk's job
2363 // is to find *an* implementing def for an
2364 // inherited call, not to compute the runtime
2365 // dispatch winner; any plausible candidate is
2366 // useful for the reverse call graph.
2367 resolved_via_mro = Some(ancestor_matching[0]);
2368 break;
2369 }
2370 }
2371 if let Some(did) = resolved_via_mro {
2372 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2373 continue;
2374 }
2375 // No receiver-matching candidates anywhere in the MRO →
2376 // fall through to bare-name resolution.
2377 }
2378
2379 // ── Priority 3: Same-file resolution ─────────────────────
2380 #[expect(clippy::cast_possible_truncation)]
2381 let file_idx_u32 = file_idx as u32;
2382 if let Some(&did) = candidates.iter().find(|(f, _)| *f == file_idx_u32) {
2383 files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2384 continue;
2385 }
2386
2387 // ── Priority 4: Imported-file resolution ──────────────────
2388 let imported_candidates: Vec<DefId> = candidates
2389 .iter()
2390 .copied()
2391 .filter(|(f, _)| imported_files[file_idx].contains(f))
2392 .collect();
2393 if imported_candidates.len() == 1 {
2394 files[file_idx].defs[def_idx].calls[call_idx].resolved =
2395 Some(imported_candidates[0]);
2396 }
2397 // Priority 5: Ambiguous or unresolved → leave None.
2398 }
2399 }
2400 }
2401}
2402
2403/// Compute a prefix-sum offset table for flattening `DefId`s to linear indices.
2404fn def_offsets(files: &[FileNode]) -> Vec<usize> {
2405 let mut offsets = Vec::with_capacity(files.len() + 1);
2406 offsets.push(0);
2407 for file in files {
2408 offsets.push(offsets.last().unwrap() + file.defs.len());
2409 }
2410 offsets
2411}
2412
2413/// Flatten a `DefId` to a linear index using the offset table.
2414fn flatten_def_id(offsets: &[usize], did: DefId) -> usize {
2415 offsets[did.0 as usize] + did.1 as usize
2416}
2417
2418/// Build top-N caller and callee lists for each definition (flattened).
2419fn build_def_neighbor_lists(
2420 n: usize,
2421 edges: &[(u32, u32, u32)],
2422 offsets: &[usize],
2423) -> (Vec<Vec<DefId>>, Vec<Vec<DefId>>) {
2424 let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
2425 let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
2426
2427 for &(src, dst, w) in edges {
2428 let (s, d) = (src as usize, dst as usize);
2429 if s < n && d < n {
2430 incoming[d].push((src, w));
2431 outgoing[s].push((dst, w));
2432 }
2433 }
2434
2435 // Convert flat index back to DefId
2436 let to_def_id = |flat: u32| -> DefId {
2437 let flat_usize = flat as usize;
2438 let file_idx = offsets.partition_point(|&o| o <= flat_usize) - 1;
2439 let def_idx = flat_usize - offsets[file_idx];
2440 #[expect(clippy::cast_possible_truncation)]
2441 (file_idx as u32, def_idx as u16)
2442 };
2443
2444 let callers = incoming
2445 .into_iter()
2446 .map(|mut v| {
2447 v.sort_by_key(|b| std::cmp::Reverse(b.1));
2448 v.truncate(MAX_NEIGHBORS);
2449 v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
2450 })
2451 .collect();
2452
2453 let callees = outgoing
2454 .into_iter()
2455 .map(|mut v| {
2456 v.sort_by_key(|b| std::cmp::Reverse(b.1));
2457 v.truncate(MAX_NEIGHBORS);
2458 v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
2459 })
2460 .collect();
2461
2462 (callers, callees)
2463}
2464
2465// ── PageRank ─────────────────────────────────────────────────────────
2466
2467/// Compute `PageRank` scores for a graph.
2468///
2469/// If `focus` is `Some(idx)`, computes topic-sensitive `PageRank` biased
2470/// toward file `idx`. Otherwise computes standard (uniform) `PageRank`.
2471///
2472/// Returns one score per node, summing to 1.0.
2473#[expect(
2474 clippy::cast_precision_loss,
2475 reason = "node count fits comfortably in f32"
2476)]
2477fn pagerank(n: usize, edges: &[(u32, u32, u32)], focus: Option<usize>) -> Vec<f32> {
2478 if n == 0 {
2479 return vec![];
2480 }
2481
2482 // Build adjacency: out_edges[src] = [(dst, weight)]
2483 let mut out_edges: Vec<Vec<(usize, f32)>> = vec![vec![]; n];
2484 let mut out_weight: Vec<f32> = vec![0.0; n];
2485
2486 for &(src, dst, w) in edges {
2487 let (s, d) = (src as usize, dst as usize);
2488 if s < n && d < n {
2489 #[expect(clippy::cast_possible_truncation, reason = "edge weights are small")]
2490 let wf = f64::from(w) as f32;
2491 out_edges[s].push((d, wf));
2492 out_weight[s] += wf;
2493 }
2494 }
2495
2496 // Personalization vector (Haveliwala 2002, topic-sensitive PageRank).
2497 //
2498 // When a focus file is specified, the teleportation distribution is split:
2499 // - PERSONALIZATION_ALPHA (0.15) concentrated on the focus node.
2500 // - (1 - PERSONALIZATION_ALPHA) = 0.85 spread uniformly over the
2501 // remaining (n - 1) other nodes.
2502 //
2503 // This gives the focus file a gentle bias over its neighbors without
2504 // collapsing every other file to an equal uniform floor. The resulting
2505 // ranks still vary across the corpus, so the caller sees a *neighborhood*
2506 // of semantically related files rebiased toward the focus (I#16 fix).
2507 //
2508 // For n == 1 there are no other nodes; the focus gets all mass (= 1.0).
2509 let bias: Vec<f32> = if let Some(idx) = focus {
2510 if n == 1 {
2511 vec![1.0_f32]
2512 } else {
2513 let other_mass = (1.0_f32 - PERSONALIZATION_ALPHA) / (n as f32 - 1.0);
2514 let mut b = vec![other_mass; n];
2515 if idx < n {
2516 b[idx] = PERSONALIZATION_ALPHA;
2517 }
2518 // Verify sum ≈ 1.0 (should hold by construction; normalization
2519 // guards against floating-point drift on very large graphs).
2520 let sum: f32 = b.iter().sum();
2521 for v in &mut b {
2522 *v /= sum;
2523 }
2524 b
2525 }
2526 } else {
2527 vec![1.0 / n as f32; n]
2528 };
2529
2530 let mut rank = vec![1.0 / n as f32; n];
2531 let mut next_rank = vec![0.0_f32; n];
2532
2533 for _ in 0..MAX_ITERATIONS {
2534 // Collect dangling mass (nodes with no outgoing edges)
2535 let dangling: f32 = rank
2536 .iter()
2537 .enumerate()
2538 .filter(|&(i, _)| out_edges[i].is_empty())
2539 .map(|(_, &r)| r)
2540 .sum();
2541
2542 // Distribute rank
2543 for (i, nr) in next_rank.iter_mut().enumerate() {
2544 *nr = (1.0 - DAMPING).mul_add(bias[i], DAMPING * dangling * bias[i]);
2545 }
2546
2547 for (src, edges_list) in out_edges.iter().enumerate() {
2548 if edges_list.is_empty() {
2549 continue;
2550 }
2551 let src_rank = rank[src];
2552 let total_w = out_weight[src];
2553 for &(dst, w) in edges_list {
2554 next_rank[dst] += DAMPING * src_rank * (w / total_w);
2555 }
2556 }
2557
2558 // Check convergence
2559 let diff: f32 = rank
2560 .iter()
2561 .zip(next_rank.iter())
2562 .map(|(a, b)| (a - b).abs())
2563 .sum();
2564
2565 std::mem::swap(&mut rank, &mut next_rank);
2566
2567 if diff < EPSILON {
2568 break;
2569 }
2570 }
2571
2572 rank
2573}
2574
2575// ── Graph Building ───────────────────────────────────────────────────
2576
2577/// Intermediate result from definition-level graph computation.
2578struct DefGraphData {
2579 def_edges: Vec<(DefId, DefId, u32)>,
2580 def_ranks: Vec<f32>,
2581 def_callers: Vec<Vec<DefId>>,
2582 def_callees: Vec<Vec<DefId>>,
2583 offsets: Vec<usize>,
2584 base_ranks: Vec<f32>,
2585 file_edges: Vec<(u32, u32, u32)>,
2586}
2587
2588/// Build bidirectional trait↔impl method edges for PageRank propagation (G3).
2589///
2590/// For every impl method that overrides a trait method, adds:
2591/// - `(impl_def_id, trait_def_id, 1)` — impl → trait
2592/// - `(trait_def_id, impl_def_id, 1)` — trait → impl
2593///
2594/// Detection heuristic: an impl method "overrides" a trait method when:
2595/// - The impl method's kind is `"function_item"` and its `scope` starts with
2596/// `"impl_item"`.
2597/// - The trait method's kind is `"function_signature_item"` and its `scope`
2598/// starts with `"trait_item"`.
2599/// - Both have the same `name`.
2600/// - The impl's file imports the trait's file (or they share a file).
2601///
2602/// This is heuristic, not sound: it may produce false positives when two
2603/// unrelated traits define methods with the same name. The practical false-
2604/// positive rate on real Rust codebases is low because method names are
2605/// usually unique within a crate.
2606#[must_use]
2607pub fn build_trait_impl_edges_pub(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
2608 build_trait_impl_edges(files)
2609}
2610
2611fn build_trait_impl_edges(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
2612 // Build index: method_name → list of (DefId, is_trait_method).
2613 // trait method: kind == "function_signature_item" (abstract) OR scope contains "trait_item".
2614 // impl method: kind == "function_item" AND scope contains "impl_item".
2615 let mut trait_methods: HashMap<String, Vec<DefId>> = HashMap::new();
2616 let mut impl_methods: HashMap<String, Vec<DefId>> = HashMap::new();
2617
2618 for (fi, file) in files.iter().enumerate() {
2619 for (di, def) in file.defs.iter().enumerate() {
2620 #[expect(clippy::cast_possible_truncation)]
2621 let did: DefId = (fi as u32, di as u16);
2622 if def.kind == "function_signature_item"
2623 || (def.scope.starts_with("trait_item") && def.kind == "function_item")
2624 {
2625 trait_methods.entry(def.name.clone()).or_default().push(did);
2626 } else if def.kind == "function_item" && def.scope.starts_with("impl_item") {
2627 impl_methods.entry(def.name.clone()).or_default().push(did);
2628 }
2629 }
2630 }
2631
2632 // Pre-build imported-files sets to restrict matching.
2633 let imported_sets: Vec<std::collections::HashSet<u32>> = files
2634 .iter()
2635 .map(|f| {
2636 f.imports
2637 .iter()
2638 .filter_map(|imp| imp.resolved_idx)
2639 .collect()
2640 })
2641 .collect();
2642
2643 let mut edges: Vec<(DefId, DefId, u32)> = Vec::new();
2644
2645 for (name, trait_defs) in &trait_methods {
2646 let Some(impl_defs) = impl_methods.get(name) else {
2647 continue;
2648 };
2649 for &(tf, td) in trait_defs {
2650 for &(imf, imd) in impl_defs {
2651 // The impl file must import the trait file (or be the same file).
2652 let connected = tf == imf
2653 || imported_sets
2654 .get(imf as usize)
2655 .is_some_and(|s| s.contains(&tf));
2656 if connected {
2657 let trait_id: DefId = (tf, td);
2658 let impl_id: DefId = (imf, imd);
2659 edges.push((trait_id, impl_id, 1));
2660 edges.push((impl_id, trait_id, 1));
2661 }
2662 }
2663 }
2664 }
2665
2666 edges
2667}
2668
2669/// Build definition-level edges, compute `PageRank`, and derive file-level data.
2670fn compute_def_graph(files: &[FileNode]) -> DefGraphData {
2671 // Build definition-level edge list from resolved calls
2672 let mut def_edge_map: HashMap<(DefId, DefId), u32> = HashMap::new();
2673 for (file_idx, file) in files.iter().enumerate() {
2674 for (def_idx, def) in file.defs.iter().enumerate() {
2675 #[expect(clippy::cast_possible_truncation)]
2676 let caller_id: DefId = (file_idx as u32, def_idx as u16);
2677 for call in &def.calls {
2678 if let Some(callee_id) = call.resolved {
2679 *def_edge_map.entry((caller_id, callee_id)).or_insert(0) += 1;
2680 }
2681 }
2682 }
2683 }
2684
2685 // Add trait↔impl bidirectional edges (G3).
2686 let trait_impl_edges = build_trait_impl_edges(files);
2687 for (src, dst, w) in trait_impl_edges {
2688 *def_edge_map.entry((src, dst)).or_insert(0) += w;
2689 }
2690
2691 let def_edges: Vec<(DefId, DefId, u32)> = def_edge_map
2692 .into_iter()
2693 .map(|((src, dst), w)| (src, dst, w))
2694 .collect();
2695
2696 // Compute def-level PageRank
2697 let offsets = def_offsets(files);
2698 let n_defs = *offsets.last().unwrap_or(&0);
2699
2700 let flat_def_edges: Vec<(u32, u32, u32)> = def_edges
2701 .iter()
2702 .map(|(src, dst, w)| {
2703 #[expect(clippy::cast_possible_truncation)]
2704 (
2705 flatten_def_id(&offsets, *src) as u32,
2706 flatten_def_id(&offsets, *dst) as u32,
2707 *w,
2708 )
2709 })
2710 .collect();
2711
2712 let def_ranks = pagerank(n_defs, &flat_def_edges, None);
2713
2714 // Aggregate def ranks to file level
2715 let base_ranks: Vec<f32> = files
2716 .iter()
2717 .enumerate()
2718 .map(|(i, file)| {
2719 let start = offsets[i];
2720 let end = start + file.defs.len();
2721 def_ranks[start..end].iter().sum()
2722 })
2723 .collect();
2724
2725 // Derive file-level edges from def-level call edges
2726 let mut file_edge_map: HashMap<(u32, u32), u32> = HashMap::new();
2727 for &(src, dst, w) in &def_edges {
2728 let src_file = src.0;
2729 let dst_file = dst.0;
2730 if src_file != dst_file {
2731 *file_edge_map.entry((src_file, dst_file)).or_insert(0) += w;
2732 }
2733 }
2734 let file_edges: Vec<(u32, u32, u32)> = file_edge_map
2735 .into_iter()
2736 .map(|((src, dst), w)| (src, dst, w))
2737 .collect();
2738
2739 // Build def-level caller/callee lists
2740 let (def_callers, def_callees) = build_def_neighbor_lists(n_defs, &flat_def_edges, &offsets);
2741
2742 DefGraphData {
2743 def_edges,
2744 def_ranks,
2745 def_callers,
2746 def_callees,
2747 offsets,
2748 base_ranks,
2749 file_edges,
2750 }
2751}
2752
2753/// Build a dependency graph from a repository root.
2754///
2755/// Walks the directory tree, parses each supported file with tree-sitter,
2756/// extracts definitions and imports, resolves import paths to files, runs
2757/// `PageRank`, and builds caller/callee lists.
2758///
2759/// # Errors
2760///
2761/// Returns an error if file walking or reading fails.
2762#[expect(
2763 clippy::too_many_lines,
2764 reason = "three-phase parallel pipeline (walk+filter, def+import extraction, call extraction) \
2765 plus resolve + graph build; phases share state (file_index, raw_sources) and \
2766 cannot be meaningfully split without passing large mutable structures across \
2767 boundaries with no clarity gain"
2768)]
2769pub fn build_graph(root: &Path) -> crate::Result<RepoGraph> {
2770 let root = root.canonicalize().map_err(|e| crate::Error::Io {
2771 path: root.display().to_string(),
2772 source: e,
2773 })?;
2774
2775 let mut walk_options = walk::WalkOptions::default();
2776 if let Some((_, config)) = crate::cache::config::find_config(&root) {
2777 walk_options.ignore_patterns = config.ignore.patterns;
2778 }
2779 let all_files = walk::collect_files_with_options(&root, &walk_options);
2780
2781 // Phase 1: parallel filter + read. For each candidate path with a
2782 // supported extension, read its source from disk and emit a tuple
2783 // alongside its relative path. rayon spreads the I/O cost across
2784 // worker threads; on a 1M-file corpus this was ~20s sequential and
2785 // now sits in the 2-3s range bounded by disk + filter throughput.
2786 let raw_inputs: Vec<(PathBuf, String, String, String)> = all_files
2787 .par_iter()
2788 .filter_map(|path| {
2789 let ext = path
2790 .extension()
2791 .and_then(|e| e.to_str())
2792 .unwrap_or_default()
2793 .to_string();
2794 if languages::config_for_extension(&ext).is_none()
2795 && import_query_for_extension(&ext).is_none()
2796 {
2797 return None;
2798 }
2799 let source = std::fs::read_to_string(path).ok()?;
2800 let rel_path = path
2801 .strip_prefix(&root)
2802 .unwrap_or(path)
2803 .display()
2804 .to_string();
2805 Some((path.clone(), rel_path, ext, source))
2806 })
2807 .collect();
2808
2809 // Build the contiguous `files` Vec and the absolute-path -> idx
2810 // lookup. Sequential because both want stable indices that match
2811 // `raw_sources`'s order; the per-file work this gates is trivial.
2812 let mut file_index: HashMap<PathBuf, usize> = HashMap::with_capacity(raw_inputs.len());
2813 let mut files: Vec<FileNode> = Vec::with_capacity(raw_inputs.len());
2814 let mut raw_sources: Vec<(usize, String, String)> = Vec::with_capacity(raw_inputs.len());
2815 for (idx, (abs_path, rel_path, ext, source)) in raw_inputs.into_iter().enumerate() {
2816 file_index.insert(abs_path, idx);
2817 files.push(FileNode {
2818 path: rel_path,
2819 defs: vec![],
2820 imports: vec![],
2821 });
2822 raw_sources.push((idx, ext, source));
2823 }
2824
2825 // Phase 2: parallel per-file definition + import extraction. Each
2826 // file's tree-sitter parse + def/import queries are independent;
2827 // par_iter_mut over files.iter_mut().zip(raw_sources.par_iter())
2828 // lets every rayon worker grind its own slice. The closures here
2829 // borrow `&root` and `&file_index` immutably (both Sync) and write
2830 // disjoint `FileNode` slots via the &mut iterator.
2831 files
2832 .par_iter_mut()
2833 .zip(raw_sources.par_iter())
2834 .for_each(|(file, (_, ext, source))| {
2835 if let Some(config) = languages::config_for_extension(ext) {
2836 file.defs = extract_definitions(source, &config);
2837 // Go method_declaration scopes are empty after the generic
2838 // extract_definitions pass (the method is a top-level node
2839 // with no structural parent in CONTAINER_KINDS). Enrich them
2840 // with the receiver type so that resolve_calls Priority 2
2841 // (scope.contains(recv_type)) fires correctly for cross-file
2842 // Go receiver-method calls. This populates def_callers[] for
2843 // Go in compute_def_graph (P1 fix).
2844 if languages::is_go_language(&config.language) {
2845 enrich_go_method_def_scopes(source, &mut file.defs);
2846 }
2847 // SQL: prepend a synthetic file-level def named after the
2848 // filename stem (dbt/sqlmesh convention). The whole-file
2849 // byte range becomes the smallest-enclosing fallback for
2850 // FROM/JOIN call-edges that are not inside any CTE, which
2851 // is the resolution target for cross-model references
2852 // (S1, Wave 4). file.path is relative to the repo root and
2853 // is what file_stem() needs to derive the model name.
2854 if languages::is_sql_language(&config.language) {
2855 enrich_sql_file_def(&file.path, source, &mut file.defs);
2856 }
2857 }
2858 if let Some((lang, import_query)) = import_query_for_extension(ext) {
2859 let raw_imports = extract_imports(source, &lang, &import_query);
2860 let file_path = root.join(&file.path);
2861 file.imports = raw_imports
2862 .into_iter()
2863 .map(|raw| {
2864 let resolved_idx =
2865 resolve_import(&raw, ext, &file_path, &root, &file_index)
2866 .and_then(|i| u32::try_from(i).ok());
2867 ImportRef {
2868 raw_path: raw,
2869 resolved_idx,
2870 }
2871 })
2872 .collect();
2873 }
2874 });
2875
2876 // Phase 3: parallel per-file call extraction. Mutates each
2877 // FileNode's `defs[*].calls` independently. Aligned with
2878 // raw_sources by index via the zip.
2879 files
2880 .par_iter_mut()
2881 .zip(raw_sources.par_iter())
2882 .for_each(|(file, (_, ext, source))| {
2883 if let Some(call_config) = languages::call_query_for_extension(ext) {
2884 extract_calls(source, &call_config, &mut file.defs);
2885 }
2886 });
2887
2888 // Build the Python class hierarchy (class_name → parent class names) by
2889 // walking every Python source file. The map is used by `resolve_calls`
2890 // Priority 2.5 to dispatch `self.method()` calls through the MRO when
2891 // the method lives on a parent / mixin class (Q1, Wave 2).
2892 //
2893 // Parallel: extract_python_class_hierarchy is pure per-file, then we
2894 // fold the per-file maps into one global map sequentially because
2895 // HashMap is not lock-free. On a 1k-Python-file corpus this fold takes
2896 // <10ms — much smaller than the parallel parse work that feeds it.
2897 let python_hierarchies: Vec<HashMap<String, Vec<String>>> = raw_sources
2898 .par_iter()
2899 .map(|(_, ext, source)| {
2900 if ext == "py" || ext == "pyi" {
2901 extract_python_class_hierarchy(source)
2902 } else {
2903 HashMap::new()
2904 }
2905 })
2906 .collect();
2907 let mut python_class_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
2908 for local in python_hierarchies {
2909 for (k, v) in local {
2910 // First declaration wins on name collisions across files. The
2911 // MRO walk only needs a plausible parent chain to find an
2912 // ancestor's methods; this is conservative but acceptable.
2913 python_class_hierarchy.entry(k).or_insert(v);
2914 }
2915 }
2916
2917 // Resolve call references to target definitions
2918 let def_index = build_def_index(&files);
2919 resolve_calls(&mut files, &def_index, &python_class_hierarchy);
2920
2921 // Build def-level graph, compute PageRank, and derive file-level data
2922 let graph_data = compute_def_graph(&files);
2923
2924 // Build file-level caller/callee lists
2925 let n = files.len();
2926 let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
2927
2928 // Auto-tune alpha based on graph density
2929 #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
2930 let density = if n > 1 {
2931 graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
2932 } else {
2933 0.0
2934 };
2935 let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
2936
2937 Ok(RepoGraph {
2938 files,
2939 edges: graph_data.file_edges,
2940 base_ranks: graph_data.base_ranks,
2941 callers,
2942 callees,
2943 def_edges: graph_data.def_edges,
2944 def_ranks: graph_data.def_ranks,
2945 def_callers: graph_data.def_callers,
2946 def_callees: graph_data.def_callees,
2947 def_offsets: graph_data.offsets,
2948 alpha,
2949 })
2950}
2951
2952/// Build a `RepoGraph` directly from a pre-constructed `Vec<FileNode>`.
2953///
2954/// Skips the filesystem walk phase of [`build_graph`]; useful for integration
2955/// tests that want to build synthetic graphs without touching disk.
2956///
2957/// Resolves calls, builds the def-level graph (including G3 trait↔impl edges),
2958/// computes `PageRank`, and builds caller/callee lists.
2959#[must_use]
2960pub fn build_graph_from_files_pub(mut files: Vec<FileNode>) -> RepoGraph {
2961 let def_index = build_def_index(&files);
2962 // No source is available at this entry point, so the Python class
2963 // hierarchy is empty and the MRO walk (Priority 2.5) is a no-op.
2964 // Tests that need MRO resolution should drive `resolve_calls_with_python_mro_pub`
2965 // directly and then call `build_graph_from_files_pub` for the rest of the pipeline.
2966 let empty_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
2967 resolve_calls(&mut files, &def_index, &empty_hierarchy);
2968 let graph_data = compute_def_graph(&files);
2969 let n = files.len();
2970 let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
2971
2972 #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
2973 let density = if n > 1 {
2974 graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
2975 } else {
2976 0.0
2977 };
2978 let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
2979
2980 RepoGraph {
2981 files,
2982 edges: graph_data.file_edges,
2983 base_ranks: graph_data.base_ranks,
2984 callers,
2985 callees,
2986 def_edges: graph_data.def_edges,
2987 def_ranks: graph_data.def_ranks,
2988 def_callers: graph_data.def_callers,
2989 def_callees: graph_data.def_callees,
2990 def_offsets: graph_data.offsets,
2991 alpha,
2992 }
2993}
2994
2995impl RepoGraph {
2996 /// Get the `PageRank` score for a specific definition.
2997 #[must_use]
2998 pub fn def_rank(&self, did: DefId) -> f32 {
2999 let flat = self.def_offsets[did.0 as usize] + did.1 as usize;
3000 self.def_ranks.get(flat).copied().unwrap_or(0.0)
3001 }
3002
3003 /// Look up a definition by file path and name. Returns the first match.
3004 #[must_use]
3005 pub fn find_def(&self, file_path: &str, def_name: &str) -> Option<DefId> {
3006 for (file_idx, file) in self.files.iter().enumerate() {
3007 if file.path == file_path {
3008 for (def_idx, def) in file.defs.iter().enumerate() {
3009 if def.name == def_name {
3010 #[expect(clippy::cast_possible_truncation)]
3011 return Some((file_idx as u32, def_idx as u16));
3012 }
3013 }
3014 }
3015 }
3016 None
3017 }
3018
3019 /// Resolve a caller-supplied `focus_file` string to a file index in [`Self::files`].
3020 ///
3021 /// Accepts any of the path forms that ripvec itself emits or accepts:
3022 ///
3023 /// - **Exact stored path** (`device_opt/services/storage.py`) — direct match.
3024 /// - **LSP-shaped path** (`./device_opt/services/storage.py`) — the `./`
3025 /// prefix used by every [`RepoMapLspLocation::file_path`] is stripped
3026 /// before comparison so the documented chaining pattern
3027 /// `get_repo_map(focus_file=hits[0].lsp_location.file_path)` works.
3028 /// - **Strict suffix** (`storage.py`, `services/storage.py`) — match when
3029 /// the previous character in the stored path is `/`. Avoids matching
3030 /// `foo_storage.py` for `storage.py`.
3031 ///
3032 /// Returns [`FocusResolution::Found`] when exactly one file matches,
3033 /// [`FocusResolution::Ambiguous`] when multiple files match (the caller
3034 /// surfaces the candidate list to the user), and [`FocusResolution::NotFound`]
3035 /// when no file matches.
3036 ///
3037 /// # Background
3038 ///
3039 /// Prior to this helper the MCP layer (`crates/ripvec-mcp/src/tools.rs`)
3040 /// did the matching inline with two bugs:
3041 ///
3042 /// 1. **`./` prefix mismatch.** [`RepoMapLspLocation::file_path`] always
3043 /// carries a leading `./` (see [`file_lsp_location`]), but
3044 /// [`FileNode::path`] does not. Passing the LSP location verbatim as
3045 /// `focus_file` matched zero files. The matcher silently returned
3046 /// `focus = None`, producing rank values bit-identical to the unfocused
3047 /// call — the bug originally reported as "I#20 focus_file rebias
3048 /// invisible on Python".
3049 /// 2. **Equal-length false negative.** When the user passed
3050 /// `./device_opt/services/storage.py` and the stored path was
3051 /// `device_opt/services/storage.py`, `exact` was false (the strings
3052 /// differ by two bytes) and `strict_suffix` was false (the focus is
3053 /// longer than the stored path, so `p.len() > focus.len()` fails). The
3054 /// pathology surfaced specifically when the focus was a *full* path
3055 /// with the LSP `./` prefix.
3056 ///
3057 /// Centralising the resolution here gives every caller the same
3058 /// normalization-tolerant semantics and one place to test the contract.
3059 #[must_use]
3060 pub fn resolve_focus_file(&self, focus: &str) -> FocusResolution {
3061 let normalized = normalize_focus_path(focus);
3062 let matches: Vec<usize> = self
3063 .files
3064 .iter()
3065 .enumerate()
3066 .filter_map(|(idx, f)| {
3067 if focus_matches_path(&f.path, normalized) {
3068 Some(idx)
3069 } else {
3070 None
3071 }
3072 })
3073 .collect();
3074 match matches.len() {
3075 0 => FocusResolution::NotFound,
3076 1 => FocusResolution::Found(matches[0]),
3077 _ => FocusResolution::Ambiguous(
3078 matches
3079 .into_iter()
3080 .map(|i| self.files[i].path.clone())
3081 .collect(),
3082 ),
3083 }
3084 }
3085}
3086
3087/// Result of resolving a user-supplied `focus_file` string against a [`RepoGraph`].
3088///
3089/// See [`RepoGraph::resolve_focus_file`] for the resolution semantics and the
3090/// historical bug that motivated the helper.
3091#[derive(Debug, Clone)]
3092pub enum FocusResolution {
3093 /// Exactly one file matched. Carries the file index in [`RepoGraph::files`].
3094 Found(usize),
3095 /// No file matched. The caller treats this as an unfocused call.
3096 NotFound,
3097 /// Two or more files matched. The caller surfaces the candidate list so
3098 /// the user can disambiguate by passing a longer suffix or the full path.
3099 Ambiguous(Vec<String>),
3100}
3101
3102/// Strip the leading `./` prefix from a focus_file path.
3103///
3104/// The `./` form is produced by [`file_lsp_location`] for every
3105/// [`RepoMapLspLocation::file_path`] field on a relative path. Stripping it
3106/// gives a stored-path-shaped value for the suffix matcher to compare
3107/// against [`FileNode::path`] entries (which do not carry the prefix).
3108///
3109/// Absolute paths (`/abs/path/file.py`) are returned unchanged; they will
3110/// fail the suffix match against the relative stored paths, which is the
3111/// correct behavior (the caller meant a different root entirely).
3112fn normalize_focus_path(focus: &str) -> &str {
3113 focus.strip_prefix("./").unwrap_or(focus)
3114}
3115
3116/// Return true when `focus` matches `stored_path` as either an exact path or
3117/// a strict-suffix (must be preceded by `/`). The empty focus does not match.
3118fn focus_matches_path(stored_path: &str, focus: &str) -> bool {
3119 if focus.is_empty() {
3120 return false;
3121 }
3122 if stored_path == focus {
3123 return true;
3124 }
3125 stored_path.len() > focus.len()
3126 && stored_path.ends_with(focus)
3127 && stored_path.as_bytes()[stored_path.len() - focus.len() - 1] == b'/'
3128}
3129
3130/// Build top-N caller and callee lists for each file.
3131///
3132/// Given a list of weighted directed edges `(src, dst, weight)` over `n`
3133/// nodes, returns `(callers[i], callees[i])` for each node `i`, where each
3134/// list contains the top-[`MAX_NEIGHBORS`] adjacent nodes sorted by descending
3135/// edge weight.
3136///
3137/// Exposed as `pub` so that integration tests can construct synthetic
3138/// [`RepoGraph`] instances for unit-testing the JSON rendering without going
3139/// through a full disk walk.
3140#[must_use]
3141pub fn build_neighbor_lists(n: usize, edges: &[(u32, u32, u32)]) -> (Vec<Vec<u32>>, Vec<Vec<u32>>) {
3142 let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3143 let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3144
3145 for &(src, dst, w) in edges {
3146 let (s, d) = (src as usize, dst as usize);
3147 if s < n && d < n {
3148 incoming[d].push((src, w));
3149 outgoing[s].push((dst, w));
3150 }
3151 }
3152
3153 // Sort by weight descending, keep top N
3154 let trim = |lists: &mut [Vec<(u32, u32)>]| -> Vec<Vec<u32>> {
3155 lists
3156 .iter_mut()
3157 .map(|list| {
3158 list.sort_by_key(|b| std::cmp::Reverse(b.1));
3159 list.iter()
3160 .take(MAX_NEIGHBORS)
3161 .map(|(idx, _)| *idx)
3162 .collect()
3163 })
3164 .collect()
3165 };
3166
3167 (trim(&mut incoming), trim(&mut outgoing))
3168}
3169
3170// ── Rendering ────────────────────────────────────────────────────────
3171
3172/// Render a budget-constrained overview of the repository.
3173///
3174/// Files are sorted by `PageRank` (or topic-sensitive rank if `focus` is
3175/// `Some`). Output uses four tiers of decreasing detail:
3176///
3177/// - **Tier 0** (top 10%): full path, rank, callers/callees, signatures with scopes
3178/// - **Tier 1** (next 20%): full path, rank, signatures
3179/// - **Tier 2** (next 40%): full path, rank, definition names and kinds
3180/// - **Tier 3** (bottom 30%): file path only
3181///
3182/// Stops accumulating output when the estimated token count exceeds
3183/// `max_tokens`.
3184#[must_use]
3185pub fn render(graph: &RepoGraph, max_tokens: usize, focus: Option<usize>) -> String {
3186 let n = graph.files.len();
3187 if n == 0 {
3188 return String::new();
3189 }
3190
3191 // Compute ranks (recompute topic-sensitive if focus is given)
3192 let ranks = if focus.is_some() {
3193 pagerank(n, &graph.edges, focus)
3194 } else {
3195 graph.base_ranks.clone()
3196 };
3197
3198 // Sort file indices by rank descending
3199 let mut sorted: Vec<usize> = (0..n).collect();
3200 sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
3201
3202 let mut output = String::new();
3203 let mut used_tokens = 0;
3204 let max_chars = max_tokens * CHARS_PER_TOKEN;
3205
3206 for (rank_pos, &file_idx) in sorted.iter().enumerate() {
3207 if used_tokens >= max_tokens {
3208 break;
3209 }
3210
3211 let file = &graph.files[file_idx];
3212 let score = ranks[file_idx];
3213 #[expect(clippy::cast_precision_loss, reason = "file counts fit in f32")]
3214 let percentile = (rank_pos as f32) / (n as f32);
3215
3216 let section = if percentile < 0.1 {
3217 render_tier0(graph, file_idx, file, score)
3218 } else if percentile < 0.3 {
3219 render_tier1(file, score)
3220 } else if percentile < 0.7 {
3221 render_tier2(file, score)
3222 } else {
3223 render_tier3(file)
3224 };
3225
3226 let section_chars = section.len();
3227 if used_tokens > 0 && used_tokens + section_chars / CHARS_PER_TOKEN > max_tokens {
3228 // Would exceed budget — try to fit at least the path
3229 let path_line = format!("{}\n", file.path);
3230 let path_tokens = path_line.len() / CHARS_PER_TOKEN;
3231 if used_tokens + path_tokens <= max_tokens {
3232 output.push_str(&path_line);
3233 }
3234 break;
3235 }
3236
3237 output.push_str(§ion);
3238 used_tokens = output.len().min(max_chars) / CHARS_PER_TOKEN;
3239 }
3240
3241 output
3242}
3243
3244/// Render tier 0: full detail with callers, callees, and signatures.
3245fn render_tier0(graph: &RepoGraph, file_idx: usize, file: &FileNode, score: f32) -> String {
3246 let mut out = format!("## {} (rank: {score:.4})\n", file.path);
3247
3248 // Callers
3249 if file_idx < graph.callers.len() && !graph.callers[file_idx].is_empty() {
3250 let _ = write!(out, " called by: ");
3251 let names: Vec<&str> = graph.callers[file_idx]
3252 .iter()
3253 .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
3254 .collect();
3255 let _ = writeln!(out, "{}", names.join(", "));
3256 }
3257
3258 // Callees
3259 if file_idx < graph.callees.len() && !graph.callees[file_idx].is_empty() {
3260 let _ = write!(out, " calls: ");
3261 let names: Vec<&str> = graph.callees[file_idx]
3262 .iter()
3263 .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
3264 .collect();
3265 let _ = writeln!(out, "{}", names.join(", "));
3266 }
3267
3268 // Definitions with scope and signature
3269 for def in &file.defs {
3270 let scope_prefix = if def.scope.is_empty() {
3271 String::new()
3272 } else {
3273 format!("{} > ", def.scope)
3274 };
3275 if let Some(sig) = &def.signature {
3276 let _ = writeln!(out, " {scope_prefix}{} {sig}", def.kind);
3277 } else {
3278 let _ = writeln!(out, " {scope_prefix}{} {}", def.kind, def.name);
3279 }
3280 }
3281 let _ = writeln!(out);
3282 out
3283}
3284
3285/// Render tier 1: file path, rank, and signatures.
3286fn render_tier1(file: &FileNode, score: f32) -> String {
3287 let mut out = format!("## {} (rank: {score:.4})\n", file.path);
3288 for def in &file.defs {
3289 if let Some(sig) = &def.signature {
3290 let _ = writeln!(out, " {sig}");
3291 } else {
3292 let _ = writeln!(out, " {} {}", def.kind, def.name);
3293 }
3294 }
3295 let _ = writeln!(out);
3296 out
3297}
3298
3299/// Render tier 2: file path, rank, and definition names/kinds.
3300fn render_tier2(file: &FileNode, score: f32) -> String {
3301 let mut out = format!("{} (rank: {score:.4})", file.path);
3302 if !file.defs.is_empty() {
3303 let names: Vec<String> = file
3304 .defs
3305 .iter()
3306 .map(|d| format!("{}:{}", d.kind, d.name))
3307 .collect();
3308 let _ = write!(out, " -- {}", names.join(", "));
3309 }
3310 let _ = writeln!(out);
3311 out
3312}
3313
3314/// Render tier 3: file path only.
3315fn render_tier3(file: &FileNode) -> String {
3316 format!("{}\n", file.path)
3317}
3318
3319// ── JSON rendering ───────────────────────────────────────────────────
3320
3321/// Build the `lsp_location` for a file itself (line 0).
3322fn file_lsp_location(path: &str) -> RepoMapLspLocation {
3323 RepoMapLspLocation {
3324 file_path: if path.starts_with("./") || path.starts_with('/') {
3325 path.to_string()
3326 } else {
3327 format!("./{path}")
3328 },
3329 start_line: 0,
3330 start_character: 0,
3331 end_line: 0,
3332 end_character: 0,
3333 }
3334}
3335
3336/// Infer `ContentKind` from a file path's extension.
3337fn content_kind_for_path(path: &str) -> ContentKind {
3338 let ext = std::path::Path::new(path)
3339 .extension()
3340 .and_then(|e| e.to_str())
3341 .unwrap_or("");
3342 ContentKind::from_extension(ext)
3343}
3344
3345/// Minimum byte envelope reserved for each included file.
3346///
3347/// Even a file with zero symbols takes JSON overhead for path, rank, arrays,
3348/// etc. Calibrated against actual serde_json output for an empty `RepoMapFile`:
3349/// `{"lsp_location":{"file_path":"./src/file_N.rs","start_line":0,"start_character":0,`
3350/// `"end_line":0,"end_character":0},"rank":0.1234,"content_kind":"code",`
3351/// `"calls":[],"symbols":[],"truncated_symbols":0,"truncated_calls":0}` ≈ 250 bytes.
3352///
3353/// This floor prevents the budget allocator from giving a file so little space
3354/// that it can emit no envelope at all.
3355const FILE_ENVELOPE_MIN_BYTES: usize = 250;
3356
3357/// Minimum useful payload for an admitted file: envelope plus room for at
3358/// least 2-3 typical-sized symbols. Files whose fair share cannot meet this
3359/// floor are excluded entirely (Fix A, 4.0.2). Without this guard, low-rank
3360/// tail files consume budget on envelopes that contain no symbols or calls,
3361/// crowding out content for the top files.
3362const FILE_MIN_USEFUL_BYTES: usize = 600;
3363
3364/// Fraction of each file's per-file budget reserved for outgoing-call edges
3365/// after the envelope is paid. The remaining (1 - this) fraction goes to
3366/// symbols. Symbol leftover flows into calls; call leftover flows to the
3367/// next file. (Fix C, 4.0.2 — without a reserve, the symbol loop saturates
3368/// the per-file budget and calls always come up empty.)
3369const CALLS_BUDGET_FRACTION: f64 = 0.30;
3370
3371/// Maximum fraction of the total budget that a single file may claim.
3372///
3373/// Without this cap a single very-high-rank file (e.g. `lib.rs`) could
3374/// consume the entire budget, leaving all other files empty.
3375const MAX_FILE_SHARE: f64 = 0.40;
3376
3377/// AST kind priority for orientation-style symbol ordering. Higher = surface
3378/// earlier. Used when def-level PageRank is degenerate (most ranks near zero)
3379/// to fall back on structural signal rather than noise.
3380///
3381/// The intuition: a reader orienting in a codebase wants to see the file's
3382/// *shape* before its *behaviors*. Types declare shape; functions declare
3383/// behavior; fields and constants are internal detail. This ordering matches
3384/// how humans read code top-down. (Fix B, 4.0.2.)
3385fn ast_kind_priority(kind: &str) -> u32 {
3386 match kind {
3387 // Tier 3: shape — what THIS file is
3388 "trait_item" | "interface" | "trait" => 30,
3389 "struct_item" | "struct" | "class_definition" | "class" => 29,
3390 "enum_item" | "enum" => 28,
3391 "type_item" | "type_alias_declaration" | "type_alias" => 27,
3392 "mod_item" | "module" | "namespace" => 26,
3393 // Tier 2: behavior — what THIS file does
3394 "function_item" | "function_definition" | "function" | "method_definition" => 20,
3395 "impl_item" | "impl" => 19,
3396 // Tier 1: declarations
3397 "const_item" | "const_declaration" | "const" => 10,
3398 "static_item" | "static" => 9,
3399 // Tier 0: internals (fields, variables, parameters)
3400 _ => 0,
3401 }
3402}
3403
3404/// Effective AST priority with corpus-relative rank promotion (4.0.4).
3405///
3406/// Preserves the 4.0.2 AST-priority ordering by default (types first,
3407/// then functions, then fields). When a def's PageRank significantly
3408/// exceeds the corpus median, promotes it up one or two tiers so that
3409/// load-bearing defs surface alongside their declared-tier neighbors.
3410///
3411/// Thresholds are corpus-median multiples (self-calibrating):
3412/// - rank > 4× median → +1 tier (e.g., hot function joins type tier)
3413/// - rank > 16× median → +2 tiers (extremely hot def)
3414/// - otherwise → declared tier preserved
3415///
3416/// On degenerate (flat) rank distributions the median equals the floor,
3417/// nothing crosses threshold, and 4.0.2 AST-priority ordering is fully
3418/// preserved. On informative distributions (post-4.0.3 enrichment),
3419/// hot defs surface proportionally.
3420fn effective_priority(kind: &str, def_rank: f32, promo_1: f32, promo_2: f32) -> u32 {
3421 let base = ast_kind_priority(kind);
3422 // Accumulate promotion tiers as a plain integer to satisfy clippy's
3423 // bool_to_int_with_if lint while preserving branch clarity.
3424 let promo_tiers: u32 = u32::from(def_rank > promo_1) + u32::from(def_rank > promo_2);
3425 // Tier spacing matches ast_kind_priority's 10-unit gaps.
3426 base + promo_tiers * 10
3427}
3428
3429/// Estimate the serialised JSON byte cost of one `RepoMapSymbol`.
3430///
3431/// Calibrated against actual serde_json output. A `RepoMapSymbol` serialises to
3432/// approximately:
3433/// `{"name":"<N>","kind":<K>,"lsp_location":{"file_path":"<P>","start_line":0,`
3434/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
3435///
3436/// That is ~165 bytes of overhead (braces, keys, fixed-width integers, rank)
3437/// plus the name length and file_path length. We pass the path length
3438/// separately because the path is the same for all symbols in one file.
3439fn estimate_symbol_bytes(name: &str) -> usize {
3440 // 165 bytes overhead + name length.
3441 // The file_path is not included here because it is part of the
3442 // envelope cost accounted separately.
3443 165 + name.len()
3444}
3445
3446/// Estimate the serialised JSON byte cost of one `RepoMapCall`.
3447///
3448/// Each call entry: `{"lsp_location":{"file_path":"<P>","start_line":0,`
3449/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
3450/// ≈ 120 bytes overhead + path length.
3451fn estimate_call_bytes(target_path: &str) -> usize {
3452 120 + target_path.len()
3453}
3454
3455/// Render a `PageRank`-weighted JSON map with token-budget allocation (4.0.1).
3456///
3457/// # Algorithm
3458///
3459/// **Step 1 — File-share allocation.** Each eligible file receives a byte
3460/// budget proportional to its `base_rank`. The share is capped at 40% of
3461/// `budget_total_bytes` and floored at [`FILE_ENVELOPE_MIN_BYTES`] (200 B).
3462/// Files are included in rank order until the cumulative allocation would
3463/// exceed the total budget.
3464///
3465/// **Step 2 — Per-file symbol fill.** For each included file, symbols are
3466/// walked in def-rank descending order. Inclusion continues until either (a)
3467/// the file's budget share is exhausted (with carry-over of leftover bytes to
3468/// the next file) or (b) a logarithmic attenuation cutoff fires: symbol at
3469/// position `i` (0-based) is included only if its rank ≥ `top_rank /
3470/// (1 + ln(i + 1))`. The same algorithm fills `calls[]` in target-file
3471/// base-rank order. `truncated_symbols` and `truncated_calls` track the
3472/// count of omitted entries.
3473///
3474/// **Step 3 — Response telemetry.** The response includes `estimated_bytes`
3475/// (actual returned content size), `budget_bytes` (`token_budget * 4`),
3476/// and `budget_exhausted` (`total_files > files.len()`).
3477///
3478/// # Arguments
3479///
3480/// - `graph` — the built dependency graph.
3481/// - `token_budget` — caller-specified token budget (× 4 = byte budget).
3482/// - `focus` — optional file index for topic-sensitive `PageRank`.
3483/// - `include_metadata` — when `false` (default), Meta-classified files
3484/// are excluded before ranking.
3485#[must_use]
3486#[expect(
3487 clippy::cast_precision_loss,
3488 reason = "rank sums and counts are small f32/f64; precision loss is acceptable"
3489)]
3490#[expect(
3491 clippy::too_many_lines,
3492 reason = "the three-step allocation algorithm (file-share → symbol-fill → calls-fill) \
3493 is sequential and share state; splitting into helpers would require passing \
3494 mutable slices across three boundaries with no clarity gain"
3495)]
3496pub fn render_json_budgeted(
3497 graph: &RepoGraph,
3498 token_budget: usize,
3499 focus: Option<usize>,
3500 include_metadata: bool,
3501) -> GetRepoMapResponse {
3502 let n = graph.files.len();
3503 if n == 0 {
3504 let budget_bytes = token_budget * CHARS_PER_TOKEN;
3505 return GetRepoMapResponse {
3506 files: vec![],
3507 total_files: 0,
3508 estimated_bytes: 0,
3509 budget_bytes,
3510 budget_exhausted: false,
3511 capped: false,
3512 };
3513 }
3514
3515 let budget_total_bytes = token_budget * CHARS_PER_TOKEN;
3516
3517 // Recompute topic-sensitive ranks if focus is given.
3518 let ranks = if focus.is_some() {
3519 pagerank(n, &graph.edges, focus)
3520 } else {
3521 graph.base_ranks.clone()
3522 };
3523
3524 // Sort all file indices by rank descending.
3525 let mut sorted: Vec<usize> = (0..n).collect();
3526 sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
3527
3528 // Apply metadata exclusion filter.
3529 let eligible: Vec<usize> = if include_metadata {
3530 sorted
3531 } else {
3532 sorted
3533 .into_iter()
3534 .filter(|&idx| {
3535 let kind = content_kind_for_path(&graph.files[idx].path);
3536 kind != ContentKind::Meta
3537 })
3538 .collect()
3539 };
3540
3541 let total_files = eligible.len();
3542
3543 // ── Corpus-median def-rank thresholds for tier promotion (4.0.4) ────────
3544 //
3545 // Compute once per call (corpus-wide, not per-file) so the threshold is
3546 // self-calibrating: flat distributions (all ranks equal) set median = floor
3547 // and nothing crosses threshold; informative distributions see proportional
3548 // promotion. Using corpus-wide median ensures a hot function in one file is
3549 // judged against the entire corpus, not just its local file peers.
3550 // Use the 75th percentile of nonzero def-ranks as the corpus reference value
3551 // for tier promotion (rather than the 50th percentile / median). The 75th
3552 // percentile is more robust: on a flat distribution most defs cluster near the
3553 // floor, so the 75th percentile is only marginally above the floor (making the
3554 // 4× threshold very selective). On an informative distribution (post-4.0.3
3555 // call-edge enrichment) the 75th percentile is meaningfully above the floor,
3556 // so the same 4× multiplier captures genuinely hot defs without falsely
3557 // promoting slightly-above-floor helpers.
3558 //
3559 // The 50th percentile (lower median) was rejected because on a 10-def corpus
3560 // with max/min ratio 5× the median equals the floor, causing the 4× threshold
3561 // to fire on defs that are only 5× above floor (a low-variance corpus). The
3562 // 75th percentile corrects this without requiring hand-tuned per-corpus magic
3563 // numbers.
3564 let corpus_reference_rank: f32 = {
3565 let mut nonzero: Vec<f32> = graph
3566 .def_ranks
3567 .iter()
3568 .copied()
3569 .filter(|r| *r > 0.0)
3570 .collect();
3571 if nonzero.is_empty() {
3572 0.0
3573 } else {
3574 nonzero.sort_unstable_by(f32::total_cmp);
3575 let n = nonzero.len();
3576 // 75th percentile index: floor(0.75 * (n - 1))
3577 let idx = (3 * (n - 1)) / 4;
3578 nonzero[idx]
3579 }
3580 };
3581 let promo_1_threshold = corpus_reference_rank * 4.0; // +1 tier
3582 let promo_2_threshold = corpus_reference_rank * 16.0; // +2 tiers
3583
3584 // ── Step 1: File-share allocation ────────────────────────────────
3585
3586 // Greedily determine which files fit within the budget, computing each
3587 // file's share as it is added. We must run a two-pass approach:
3588 // pass A: determine which files are included (cumulative sum check),
3589 // pass B: fill symbols/calls using final per-file allocations.
3590 //
3591 // The "included" decision is based on the running cumulative sum so that
3592 // the leftover redistribution in step 2 can carry forward correctly.
3593
3594 // Floor-first admission (Fix A, 4.0.2):
3595 //
3596 // Cap admitted file count so each gets at least FILE_MIN_USEFUL_BYTES.
3597 // Below this threshold the response would carry envelopes that contain
3598 // no symbols or calls — pure overhead, no information. Concentrating
3599 // the budget on fewer files with real content is strictly better for
3600 // orientation than dropping envelope sentinels for many files.
3601 let max_admissible = budget_total_bytes / FILE_MIN_USEFUL_BYTES;
3602 let admit_count = eligible.len().min(max_admissible.max(1));
3603
3604 let budget_f64 = budget_total_bytes as f64;
3605
3606 // Pre-compute rank sum across ADMITTED files only (top-N by rank). f64
3607 // to avoid precision loss when summing many small f32 values.
3608 let admitted_rank_sum: f64 = eligible
3609 .iter()
3610 .take(admit_count)
3611 .map(|&idx| f64::from(ranks[idx]))
3612 .sum();
3613 let admitted_rank_sum = if admitted_rank_sum > 0.0 {
3614 admitted_rank_sum
3615 } else {
3616 1.0
3617 };
3618
3619 // Compute per-file budgets. Each admitted file gets at least
3620 // FILE_MIN_USEFUL_BYTES; the proportional-to-rank share is applied on
3621 // top of the floor and capped at MAX_FILE_SHARE.
3622 let mut included_indices: Vec<usize> = Vec::new(); // indices into `eligible`
3623 let mut file_budgets: Vec<usize> = Vec::new();
3624 let mut cumulative_budget: usize = 0;
3625
3626 for (i, &file_idx) in eligible.iter().take(admit_count).enumerate() {
3627 let file_rank = f64::from(ranks[file_idx]);
3628 let raw_share = budget_f64 * file_rank / admitted_rank_sum;
3629 let capped = raw_share.min(budget_f64 * MAX_FILE_SHARE);
3630 // `capped` is non-negative and bounded by budget_f64 (a usize).
3631 #[expect(
3632 clippy::cast_possible_truncation,
3633 clippy::cast_sign_loss,
3634 reason = "capped is non-negative and bounded by budget_total_bytes (a usize)"
3635 )]
3636 let budget_i = (capped as usize).max(FILE_MIN_USEFUL_BYTES);
3637
3638 if cumulative_budget + budget_i > budget_total_bytes && !included_indices.is_empty() {
3639 break;
3640 }
3641 cumulative_budget += budget_i;
3642 included_indices.push(i);
3643 file_budgets.push(budget_i);
3644 }
3645
3646 // ── Step 2: Per-file symbol fill ─────────────────────────────────
3647
3648 let mut result_files: Vec<RepoMapFile> = Vec::with_capacity(included_indices.len());
3649 let mut leftover: usize = 0; // unused bytes carried from previous file
3650
3651 for (slot, &eligible_i) in included_indices.iter().enumerate() {
3652 let file_idx = eligible[eligible_i];
3653 let file = &graph.files[file_idx];
3654 let file_rank = ranks[file_idx];
3655 let file_path_lsp = file_lsp_location(&file.path);
3656
3657 let budget_in = file_budgets[slot] + leftover;
3658
3659 // Reserve a fraction of the post-envelope budget for outgoing calls
3660 // (Fix C, 4.0.2). Without this guard the symbol loop saturates
3661 // `budget_in` and the calls loop always trips its byte-check.
3662 // Symbol leftover flows into calls; call leftover flows to the
3663 // next file via the outer `leftover` variable.
3664 let post_envelope = budget_in.saturating_sub(FILE_ENVELOPE_MIN_BYTES);
3665 #[expect(
3666 clippy::cast_possible_truncation,
3667 clippy::cast_sign_loss,
3668 reason = "post_envelope * fraction is bounded by post_envelope (a usize)"
3669 )]
3670 let calls_reserve = (post_envelope as f64 * CALLS_BUDGET_FRACTION) as usize;
3671 let symbols_budget = FILE_ENVELOPE_MIN_BYTES + post_envelope.saturating_sub(calls_reserve);
3672 let mut used: usize = FILE_ENVELOPE_MIN_BYTES; // envelope cost
3673
3674 // ── Symbols ──────────────────────────────────────────────────
3675 // Retrieve def-level ranks for this file via the offset table.
3676 let def_count = file.defs.len();
3677 let def_offset = if file_idx < graph.def_offsets.len() {
3678 graph.def_offsets[file_idx]
3679 } else {
3680 0
3681 };
3682
3683 // Build (def_idx, rank, kind_priority, start_byte) tuples. We sort
3684 // by a composite key: AST kind priority (descending) — putting types
3685 // before functions before fields — then by def_rank (descending)
3686 // within each tier. This is Fix B (4.0.2): the def_rank distribution
3687 // is often degenerate (most defs share near-zero rank because the
3688 // call-edge extractor doesn't capture every dispatch), so we use
3689 // structural signal as the primary ordering and def_rank as the
3690 // within-tier tiebreaker. When def_rank IS informative, it dominates
3691 // *within* its kind tier and recovers the original behavior; the AST
3692 // signal only shifts ordering *between* tiers.
3693 let mut def_rank_pairs: Vec<(usize, f32, u32, u32)> = (0..def_count)
3694 .map(|di| {
3695 let flat = def_offset + di;
3696 let r = graph.def_ranks.get(flat).copied().unwrap_or(0.0);
3697 // Store the ORIGINAL ast_kind_priority in the tuple (used by the
3698 // per-tier attenuation loop below). The sort comparator uses
3699 // effective_priority (which may be higher due to 4.0.4 promotion)
3700 // to reorder hot defs ahead of cold type-tier defs, while the
3701 // attenuation tier tracker continues to use the original AST tier
3702 // so the existing per-tier cutoff behaviour is preserved.
3703 let kind_prio = ast_kind_priority(&file.defs[di].kind);
3704 let decl_order = file.defs[di].start_byte;
3705 (di, r, kind_prio, decl_order)
3706 })
3707 .collect();
3708 def_rank_pairs.sort_unstable_by(|a, b| {
3709 // Primary: effective priority (4.0.4: AST kind + corpus-rank promotion) descending.
3710 // Hot defs that exceed corpus-median thresholds are promoted above their
3711 // declared tier so they surface before cold type-tier defs.
3712 let eff_a = effective_priority(
3713 &file.defs[a.0].kind,
3714 a.1,
3715 promo_1_threshold,
3716 promo_2_threshold,
3717 );
3718 let eff_b = effective_priority(
3719 &file.defs[b.0].kind,
3720 b.1,
3721 promo_1_threshold,
3722 promo_2_threshold,
3723 );
3724 eff_b
3725 .cmp(&eff_a)
3726 // Secondary: def_rank descending within tier.
3727 .then_with(|| b.1.total_cmp(&a.1))
3728 // Tertiary: earlier declaration order (stable, deterministic).
3729 .then_with(|| a.3.cmp(&b.3))
3730 });
3731
3732 let top_def_rank = def_rank_pairs.first().map(|&(_, r, _, _)| r).unwrap_or(0.0);
3733
3734 let mut symbols: Vec<RepoMapSymbol> = Vec::new();
3735 let mut truncated_symbols: usize = 0;
3736
3737 // Track per-tier position for the attenuation cutoff. When AST kind
3738 // priority changes (we've moved from types to functions, say), reset
3739 // the position so the attenuation curve restarts. Otherwise a
3740 // structurally-equivalent-but-later tier would be unfairly cut.
3741 let mut tier_pos: usize = 0;
3742 let mut current_tier: Option<u32> = None;
3743 let mut tier_top_rank: f32 = top_def_rank;
3744
3745 for (pos, &(di, def_r, kind_prio, _)) in def_rank_pairs.iter().enumerate() {
3746 // Reset attenuation at tier boundaries.
3747 if current_tier != Some(kind_prio) {
3748 current_tier = Some(kind_prio);
3749 tier_pos = 0;
3750 tier_top_rank = def_r;
3751 }
3752
3753 // Logarithmic attenuation cutoff, relative to the tier's top rank.
3754 let cutoff = if tier_top_rank > 0.0 {
3755 tier_top_rank / (1.0 + (tier_pos as f32 + 1.0).ln())
3756 } else {
3757 0.0
3758 };
3759 if def_r < cutoff {
3760 // Attenuation cuts the rest of THIS tier; we don't stop
3761 // entirely because the next tier may still have useful
3762 // content within its own attenuation curve. Skip this def.
3763 truncated_symbols += 1;
3764 tier_pos += 1;
3765 continue;
3766 }
3767
3768 let def = &file.defs[di];
3769 let sym_bytes = estimate_symbol_bytes(&def.name);
3770 // Use the reserved symbols sub-budget (Fix C) so calls aren't
3771 // starved when symbols would otherwise saturate budget_in.
3772 if used + sym_bytes > symbols_budget {
3773 truncated_symbols += def_rank_pairs.len() - pos;
3774 break;
3775 }
3776
3777 let kind = crate::languages::lsp_symbol_kind_for_node_kind(&def.kind);
3778 let line_0 = def.start_line.saturating_sub(1) as usize;
3779 symbols.push(RepoMapSymbol {
3780 name: def.name.clone(),
3781 kind,
3782 lsp_location: RepoMapLspLocation {
3783 file_path: file_path_lsp.file_path.clone(),
3784 start_line: line_0,
3785 start_character: 0,
3786 end_line: line_0,
3787 end_character: 0,
3788 },
3789 rank: def_r,
3790 });
3791 used += sym_bytes;
3792 tier_pos += 1;
3793 }
3794
3795 // ── Calls ─────────────────────────────────────────────────────
3796 // Gather outgoing callees sorted by target file base_rank descending.
3797 let callee_indices: Vec<usize> = if file_idx < graph.callees.len() {
3798 let mut callees: Vec<(usize, f32)> = graph.callees[file_idx]
3799 .iter()
3800 .filter_map(|&ci| {
3801 let ci = ci as usize;
3802 graph.files.get(ci).map(|_| {
3803 let r = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
3804 (ci, r)
3805 })
3806 })
3807 .collect();
3808 callees.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
3809 callees.into_iter().map(|(ci, _)| ci).collect()
3810 } else {
3811 vec![]
3812 };
3813
3814 let call_total = callee_indices.len();
3815 let top_call_rank = callee_indices
3816 .first()
3817 .and_then(|&ci| graph.base_ranks.get(ci))
3818 .copied()
3819 .unwrap_or(0.0);
3820
3821 let mut calls: Vec<RepoMapCall> = Vec::new();
3822 let mut truncated_calls: usize = 0;
3823
3824 for (pos, &ci) in callee_indices.iter().enumerate() {
3825 let callee_rank = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
3826
3827 // Logarithmic attenuation cutoff on target rank.
3828 let cutoff = if top_call_rank > 0.0 {
3829 top_call_rank / (1.0 + (pos as f32 + 1.0).ln())
3830 } else {
3831 0.0
3832 };
3833 if callee_rank < cutoff {
3834 truncated_calls += call_total - pos;
3835 break;
3836 }
3837
3838 let callee_path = &graph.files[ci].path;
3839 let call_bytes = estimate_call_bytes(callee_path);
3840 if used + call_bytes > budget_in {
3841 truncated_calls += call_total - pos;
3842 break;
3843 }
3844
3845 calls.push(RepoMapCall {
3846 lsp_location: file_lsp_location(callee_path),
3847 rank: callee_rank,
3848 });
3849 used += call_bytes;
3850 }
3851
3852 // Carry unused bytes forward to the next file.
3853 leftover = budget_in.saturating_sub(used);
3854
3855 result_files.push(RepoMapFile {
3856 lsp_location: file_path_lsp,
3857 rank: file_rank,
3858 content_kind: content_kind_tag(content_kind_for_path(&file.path)),
3859 calls,
3860 symbols,
3861 truncated_symbols,
3862 truncated_calls,
3863 });
3864 }
3865
3866 let estimated_bytes = serde_json::to_string(&result_files)
3867 .map(|s| s.len())
3868 .unwrap_or(0);
3869
3870 let budget_exhausted = total_files > result_files.len();
3871
3872 GetRepoMapResponse {
3873 files: result_files,
3874 total_files,
3875 estimated_bytes,
3876 budget_bytes: budget_total_bytes,
3877 budget_exhausted,
3878 capped: budget_exhausted,
3879 }
3880}
3881
3882/// Render a `PageRank`-sorted JSON map of the repository (4.0.0 compatibility shim).
3883///
3884/// This function wraps [`render_json_budgeted`] with a synthetic token budget
3885/// derived from `max_files * 2000` (a generous per-file allowance). It exists
3886/// to keep the existing D1/D2 unit tests compiling without change; the MCP
3887/// layer calls [`render_json_budgeted`] directly in 4.0.1.
3888///
3889/// The `capped` field in the response reflects whether the budget was
3890/// exhausted before all `eligible` files were included, which is equivalent
3891/// to the previous `total_files > max_files` check.
3892///
3893/// When `include_metadata` is `false` (default), files whose extension
3894/// classifies as [`ContentKind::Meta`] are excluded before ranking.
3895#[must_use]
3896pub fn render_json(
3897 graph: &RepoGraph,
3898 max_files: usize,
3899 focus: Option<usize>,
3900 include_metadata: bool,
3901) -> GetRepoMapResponse {
3902 // Synthesise a generous token budget: 2000 tokens per requested file.
3903 // This ensures the existing D1/D2 tests (which pass small max_files values
3904 // like 3, 5, 50) see the same file-count behaviour they expect. The test
3905 // assertions check file counts, not byte sizes, so the exact budget value
3906 // only matters for ensuring enough headroom.
3907 let token_budget = max_files.saturating_mul(2000);
3908 render_json_budgeted(graph, token_budget, focus, include_metadata)
3909}
3910
3911// ── Tests ────────────────────────────────────────────────────────────
3912
3913#[cfg(test)]
3914mod tests {
3915 use super::*;
3916
3917 #[test]
3918 fn test_pagerank_simple() {
3919 // 3-node graph: 0 -> 1 -> 2, 2 -> 0 (cycle)
3920 let edges = vec![(0, 1, 1), (1, 2, 1), (2, 0, 1)];
3921 let ranks = pagerank(3, &edges, None);
3922
3923 // All nodes in a symmetric cycle should have equal rank
3924 assert_eq!(ranks.len(), 3);
3925 let sum: f32 = ranks.iter().sum();
3926 assert!(
3927 (sum - 1.0).abs() < 0.01,
3928 "ranks should sum to ~1.0, got {sum}"
3929 );
3930
3931 // In a perfect cycle, all ranks should be approximately equal
3932 let expected = 1.0 / 3.0;
3933 for (i, &r) in ranks.iter().enumerate() {
3934 assert!(
3935 (r - expected).abs() < 0.05,
3936 "rank[{i}] = {r}, expected ~{expected}"
3937 );
3938 }
3939 }
3940
3941 #[test]
3942 fn test_pagerank_star() {
3943 // Star graph: 0,1,2 all point to 3
3944 let edges = vec![(0, 3, 1), (1, 3, 1), (2, 3, 1)];
3945 let ranks = pagerank(4, &edges, None);
3946
3947 assert_eq!(ranks.len(), 4);
3948 // Node 3 should have the highest rank
3949 let max_idx = ranks
3950 .iter()
3951 .enumerate()
3952 .max_by(|a, b| a.1.total_cmp(b.1))
3953 .unwrap()
3954 .0;
3955 assert_eq!(max_idx, 3, "node 3 should have highest rank");
3956 assert!(
3957 ranks[3] > ranks[0],
3958 "rank[3]={} should be > rank[0]={}",
3959 ranks[3],
3960 ranks[0]
3961 );
3962 }
3963
3964 #[test]
3965 fn test_pagerank_topic_sensitive() {
3966 // 10-node chain: 0 -> 1 -> ... -> 9.
3967 //
3968 // With PERSONALIZATION_ALPHA = 0.15 and n = 10, the uniform share per
3969 // node is 1/10 = 0.10. The focus node (0) gets 0.15 teleportation
3970 // mass vs 0.10 uniform, so focused rank[0] > uniform rank[0] holds.
3971 //
3972 // The 3-node chain used previously broke when alpha was reduced from
3973 // 0.70 to 0.15 because 0.15 < 1/3 = 0.33 for n=3 — the focus node
3974 // received *less* teleportation than its uniform share, inverting the
3975 // expected direction. Using n=10 avoids this edge case while still
3976 // testing the personalization effect.
3977 let n = 10_usize;
3978 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
3979 let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
3980 .map(|i| (i as u32, (i + 1) as u32, 1_u32))
3981 .collect();
3982 let uniform_ranks = pagerank(n, &edges, None);
3983 let biased_ranks = pagerank(n, &edges, Some(0));
3984
3985 // With focus on node 0, it should get a higher rank than uniform
3986 // because PERSONALIZATION_ALPHA (0.15) > 1/n (0.10) for n=10.
3987 assert!(
3988 biased_ranks[0] > uniform_ranks[0],
3989 "focused rank[0]={} should be > uniform rank[0]={}",
3990 biased_ranks[0],
3991 uniform_ranks[0]
3992 );
3993 }
3994
3995 // ── J1 tests — topic-sensitive PageRank soft personalization ─────────
3996
3997 /// J1 RED: `focus_file` PageRank must not collapse other-file ranks.
3998 ///
3999 /// Baseline (pre-4.0.5) concentrated 70% mass on the focus node, producing
4000 /// a degenerate Dirac delta: focus rank ≈ 0.703, all others ≈ 0.003.
4001 /// This test fails on the baseline and must pass after the fix.
4002 ///
4003 /// Invariant: with `PERSONALIZATION_ALPHA = 0.15`, focus node gets 0.15 of
4004 /// teleportation mass and each of the other (n-1) nodes gets 0.85/(n-1).
4005 /// On a star graph with n=10 nodes, the focus node rank must NOT be more
4006 /// than 40× the average non-focus rank. The 4.0.5 fix targets roughly
4007 /// 5-10× for a well-connected graph, so 40× is a conservative upper bound
4008 /// that the baseline (≈200×) fails.
4009 #[test]
4010 fn test_focus_file_topic_pagerank_preserves_rank_dispersion() {
4011 // Star graph: nodes 1..9 all point to node 0 (high natural rank).
4012 // Focus on node 1 (low natural rank) to test personalization effect.
4013 let n = 10_usize;
4014 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4015 let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
4016
4017 let ranks_focused = pagerank(n, &edges, Some(1));
4018
4019 let focus_rank = ranks_focused[1];
4020 let sum_non_focus: f32 = ranks_focused
4021 .iter()
4022 .enumerate()
4023 .filter(|&(i, _)| i != 1)
4024 .map(|(_, &r)| r)
4025 .sum();
4026 let n_non_focus = (n - 1) as f32;
4027 let avg_non_focus = sum_non_focus / n_non_focus;
4028
4029 let dispersion_ratio = focus_rank / avg_non_focus;
4030
4031 eprintln!(
4032 "J1 dispersion: focus_rank={focus_rank:.6}, avg_non_focus={avg_non_focus:.6}, \
4033 ratio={dispersion_ratio:.2}× (must be <= 40×)"
4034 );
4035
4036 // With 0.15 personalization alpha the focus node's teleportation
4037 // advantage is modest; 40× is an upper bound the old 0.70 code violates.
4038 assert!(
4039 dispersion_ratio <= 40.0,
4040 "focus rank is {dispersion_ratio:.1}× avg non-focus rank (must be ≤ 40×); \
4041 pre-fix baseline was ~200× due to 70% concentration — I#16"
4042 );
4043
4044 // Ranks must still sum to ~1.
4045 let total: f32 = ranks_focused.iter().sum();
4046 assert!(
4047 (total - 1.0).abs() < 0.01,
4048 "ranks must sum to ≈1.0; got {total}"
4049 );
4050 }
4051
4052 /// J1 RED: focus node must have the highest rank (it still gets the bias),
4053 /// but non-focus nodes must NOT collapse to a flat floor.
4054 ///
4055 /// Concretely: the second-highest-ranked file must be ≥ 10% of the focus
4056 /// file's rank (neighborhood rebiasing, not winner-take-all).
4057 #[test]
4058 fn test_focus_file_topic_pagerank_does_not_collapse_other_files() {
4059 // Linear chain: 0 → 1 → 2 → ... → 9 (directed).
4060 // Focus on node 0. Without personalization, ranks decrease along the
4061 // chain. With soft personalization the non-focus nodes stay non-trivial.
4062 let n = 10_usize;
4063 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4064 let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
4065 .map(|i| (i as u32, (i + 1) as u32, 1_u32))
4066 .collect();
4067
4068 let ranks = pagerank(n, &edges, Some(0));
4069
4070 let focus_rank = ranks[0];
4071 // All non-focus ranks must be ≥ 10% of focus rank.
4072 for (i, &r) in ranks.iter().enumerate().skip(1) {
4073 assert!(
4074 r >= focus_rank * 0.10,
4075 "rank[{i}] = {r:.6} is < 10% of focus rank {focus_rank:.6}; \
4076 non-focus files must not collapse to near-zero (I#16)"
4077 );
4078 }
4079 }
4080
4081 // ── J2 tests — neighborhood count parity ─────────────────────────────
4082
4083 /// J2 RED: `render_json_budgeted` with `focus=Some(i)` must return at
4084 /// least 80% as many files as the unfocused call with the same budget.
4085 ///
4086 /// Baseline collapses the focused run to 1 dominant file + a flat tail
4087 /// that may still appear, but the intent is no budget waste on zero-signal
4088 /// entries. With soft personalization the focused call should return a
4089 /// similar file count to the unfocused call (within ±20%).
4090 #[test]
4091 fn test_focus_file_returns_neighborhood_not_just_focus() {
4092 // Build a 12-file star graph with meaningful rank variation.
4093 let n = 12_usize;
4094 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4095 let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
4096 let base_ranks = pagerank(n, &edges, None);
4097 let (callers, callees) = build_neighbor_lists(n, &edges);
4098
4099 let file_nodes: Vec<FileNode> = (0..n)
4100 .map(|i| FileNode {
4101 path: format!("src/file_{i}.rs"),
4102 defs: vec![Definition {
4103 name: format!("func_{i}"),
4104 kind: "function_item".to_string(),
4105 start_line: 1,
4106 end_line: 5,
4107 scope: String::new(),
4108 signature: Some(format!("fn func_{i}() -> i32")),
4109 start_byte: 0,
4110 end_byte: 100,
4111 calls: vec![],
4112 }],
4113 imports: vec![],
4114 })
4115 .collect();
4116
4117 let graph = RepoGraph {
4118 files: file_nodes,
4119 edges,
4120 base_ranks,
4121 callers,
4122 callees,
4123 def_edges: vec![],
4124 def_ranks: vec![],
4125 def_callers: vec![],
4126 def_callees: vec![],
4127 def_offsets: vec![0],
4128 alpha: 0.5,
4129 };
4130
4131 let budget = 2000; // generous budget; all 12 files should fit
4132 let unfocused = render_json_budgeted(&graph, budget, None, false);
4133 let focused = render_json_budgeted(&graph, budget, Some(1), false);
4134
4135 let unfocused_n = unfocused.files.len();
4136 let focused_n = focused.files.len();
4137 #[expect(
4138 clippy::cast_possible_truncation,
4139 clippy::cast_sign_loss,
4140 reason = "unfocused_n is a file count (small, positive); f32 multiplication \
4141 by 0.80 and ceil produce a value in [0, n]; truncation to usize is safe"
4142 )]
4143 let min_expected = (unfocused_n as f32 * 0.80).ceil() as usize;
4144
4145 eprintln!(
4146 "J2 neighborhood: unfocused={unfocused_n} files, focused={focused_n} files \
4147 (need ≥ {min_expected})"
4148 );
4149
4150 assert!(
4151 focused_n >= min_expected,
4152 "focused call returned {focused_n} files; expected ≥ {min_expected} \
4153 (80% of unfocused {unfocused_n}); soft personalization must preserve \
4154 rank dispersion across files (I#16/J2)"
4155 );
4156 }
4157
4158 /// J2 RED: topic delta fingerprinting — focused run must reorder files
4159 /// relative to unfocused run (focus file surfaces near top), but both
4160 /// must contain similar total file counts.
4161 #[test]
4162 fn test_focus_delta_topic_fingerprinting_works() {
4163 // Bidirectional 8-file ring so all nodes are structurally equivalent.
4164 // Without focus all ranks are equal. With focus on node 3, node 3
4165 // must surface as the highest-ranked file.
4166 let n = 8_usize;
4167 #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4168 let edges: Vec<(u32, u32, u32)> = (0..n)
4169 .flat_map(|i| {
4170 let next = ((i + 1) % n) as u32;
4171 let curr = i as u32;
4172 [(curr, next, 1_u32), (next, curr, 1_u32)]
4173 })
4174 .collect();
4175
4176 let ranks_uniform = pagerank(n, &edges, None);
4177 let ranks_focused = pagerank(n, &edges, Some(3));
4178
4179 // Focus node must have highest rank.
4180 let top_idx = ranks_focused
4181 .iter()
4182 .enumerate()
4183 .max_by(|a, b| a.1.total_cmp(b.1))
4184 .map(|(i, _)| i)
4185 .unwrap();
4186
4187 assert_eq!(
4188 top_idx, 3,
4189 "with focus=Some(3), node 3 must have highest rank; top was {top_idx}"
4190 );
4191
4192 // Uniform baseline: all ranks should be approximately equal.
4193 let uniform_max = ranks_uniform
4194 .iter()
4195 .copied()
4196 .fold(f32::NEG_INFINITY, f32::max);
4197 let uniform_min = ranks_uniform.iter().copied().fold(f32::INFINITY, f32::min);
4198 assert!(
4199 (uniform_max - uniform_min).abs() < 0.01,
4200 "on a ring without focus all ranks should be ≈equal; max={uniform_max:.6} min={uniform_min:.6}"
4201 );
4202
4203 // Focused run must rank the focus node significantly higher than others
4204 // but others must remain non-trivial (≥ 5% of focus).
4205 let focus_rank = ranks_focused[3];
4206 for (i, &r) in ranks_focused.iter().enumerate().filter(|&(i, _)| i != 3) {
4207 assert!(
4208 r >= focus_rank * 0.05,
4209 "rank[{i}]={r:.6} is < 5% of focus rank {focus_rank:.6}; \
4210 soft personalization must preserve non-focus ranks"
4211 );
4212 }
4213 }
4214
4215 // ── T1 tests — focus_file resolver normalization (I#20) ──────────────
4216
4217 /// Build a tiny synthetic graph whose `FileNode::path` values match the
4218 /// shape `build_graph` produces on disk (no leading `./`, forward slashes).
4219 fn focus_resolver_graph() -> RepoGraph {
4220 let file_nodes: Vec<FileNode> = vec![
4221 FileNode {
4222 path: "device_opt/services/storage.py".to_string(),
4223 defs: vec![],
4224 imports: vec![],
4225 },
4226 FileNode {
4227 path: "device_opt/ui/textual/screens/settings.py".to_string(),
4228 defs: vec![],
4229 imports: vec![],
4230 },
4231 FileNode {
4232 path: "device_opt/services/registry.py".to_string(),
4233 defs: vec![],
4234 imports: vec![],
4235 },
4236 FileNode {
4237 path: "tests/test_storage.py".to_string(),
4238 defs: vec![],
4239 imports: vec![],
4240 },
4241 ];
4242 let n = file_nodes.len();
4243 RepoGraph {
4244 files: file_nodes,
4245 edges: vec![],
4246 base_ranks: vec![1.0 / n as f32; n],
4247 callers: vec![vec![]; n],
4248 callees: vec![vec![]; n],
4249 def_edges: vec![],
4250 def_ranks: vec![],
4251 def_callers: vec![],
4252 def_callees: vec![],
4253 def_offsets: vec![0; n + 1],
4254 alpha: 0.5,
4255 }
4256 }
4257
4258 /// T1: focus_file paths emitted by `lsp_location.file_path` (with the
4259 /// `./` prefix) must resolve to the correct file index.
4260 ///
4261 /// Baseline reproduction (mnemosyne corpus, 4.0.5): passing
4262 /// `focus_file="./device_opt/.../settings.py"` produced rank values
4263 /// bit-identical to the unfocused call because the strict-suffix matcher
4264 /// in `tools.rs` failed both the `exact` and the `strict_suffix` checks
4265 /// when the focus carried the LSP `./` prefix. The matcher silently
4266 /// returned `focus = None`, masking the failure as "topic-sensitive
4267 /// PageRank does nothing on Python".
4268 #[test]
4269 fn test_focus_file_resolver_accepts_lsp_location_path() {
4270 let g = focus_resolver_graph();
4271 // LSP-shaped path with leading `./` — the form documented in
4272 // get_repo_map's instructions.
4273 let res = g.resolve_focus_file("./device_opt/ui/textual/screens/settings.py");
4274 match res {
4275 FocusResolution::Found(idx) => {
4276 assert_eq!(
4277 g.files[idx].path, "device_opt/ui/textual/screens/settings.py",
4278 "resolver must accept the ./-prefixed LSP path form (I#20)"
4279 );
4280 }
4281 FocusResolution::NotFound | FocusResolution::Ambiguous(_) => {
4282 panic!(
4283 "resolver returned {res:?} for ./device_opt/ui/textual/screens/settings.py; \
4284 the LSP-shaped path form must resolve to exactly one file (I#20)"
4285 );
4286 }
4287 }
4288 }
4289
4290 /// T1: the bare stored path (no `./`) must continue to resolve.
4291 /// Regression guard for the pre-fix matcher's "exact" path.
4292 #[test]
4293 fn test_focus_file_resolver_accepts_bare_stored_path() {
4294 let g = focus_resolver_graph();
4295 let res = g.resolve_focus_file("device_opt/services/storage.py");
4296 match res {
4297 FocusResolution::Found(idx) => {
4298 assert_eq!(g.files[idx].path, "device_opt/services/storage.py");
4299 }
4300 other => panic!("expected Found, got {other:?}"),
4301 }
4302 }
4303
4304 /// T1: strict-suffix match — `storage.py` must match
4305 /// `device_opt/services/storage.py` (prev char is `/`) but ambiguity
4306 /// (two `storage.py` files) must be reported, not silently picked.
4307 #[test]
4308 fn test_focus_file_resolver_strict_suffix_and_ambiguity() {
4309 let g = focus_resolver_graph();
4310 // "storage.py" matches both device_opt/services/storage.py and
4311 // tests/test_storage.py? No — test_storage.py has `_` before `storage.py`
4312 // (not `/`), so the strict-suffix matcher rejects it. Only one match.
4313 let res = g.resolve_focus_file("storage.py");
4314 assert!(
4315 matches!(res, FocusResolution::Found(_)),
4316 "strict-suffix `storage.py` must match exactly one file (the `_` in \
4317 test_storage.py blocks the strict-suffix), got {res:?}"
4318 );
4319 // Add a second services/storage.py-shaped file to force ambiguity.
4320 let mut g2 = g.clone();
4321 g2.files.push(FileNode {
4322 path: "vendored/services/storage.py".to_string(),
4323 defs: vec![],
4324 imports: vec![],
4325 });
4326 g2.base_ranks.push(0.0);
4327 g2.callers.push(vec![]);
4328 g2.callees.push(vec![]);
4329 g2.def_offsets.push(*g2.def_offsets.last().unwrap());
4330 let res = g2.resolve_focus_file("storage.py");
4331 match res {
4332 FocusResolution::Ambiguous(cands) => {
4333 assert_eq!(cands.len(), 2, "expected two candidates, got {cands:?}");
4334 }
4335 other => panic!("expected Ambiguous, got {other:?}"),
4336 }
4337 }
4338
4339 /// T1: a focus that matches no file returns `NotFound`. The caller
4340 /// is responsible for either treating this as unfocused or surfacing
4341 /// an error — the resolver itself does not impose policy.
4342 #[test]
4343 fn test_focus_file_resolver_not_found() {
4344 let g = focus_resolver_graph();
4345 let res = g.resolve_focus_file("./does/not/exist.py");
4346 assert!(
4347 matches!(res, FocusResolution::NotFound),
4348 "expected NotFound, got {res:?}"
4349 );
4350 }
4351
4352 /// T1: empty focus does not match anything (avoids the empty-suffix
4353 /// degenerate that would otherwise match every file).
4354 #[test]
4355 fn test_focus_file_resolver_empty_input_is_not_found() {
4356 let g = focus_resolver_graph();
4357 let res = g.resolve_focus_file("");
4358 assert!(
4359 matches!(res, FocusResolution::NotFound),
4360 "empty focus must not match anything, got {res:?}"
4361 );
4362 }
4363
4364 /// T1: focus_file rank delta must be visible on a Python-shaped
4365 /// synthetic graph.
4366 ///
4367 /// Builds a small Python-style call graph (FileNode + Definition with
4368 /// resolved CallRefs, matching what `extract_calls` produces on a real
4369 /// Python corpus), runs `build_graph_from_files_pub` to get a
4370 /// `RepoGraph`, then calls `render_json_budgeted` with and without
4371 /// focus. Asserts that the focused call changes the rank of at least
4372 /// one non-focus file by ≥ 5% in either direction.
4373 ///
4374 /// On the baseline (pre-T1) this test passes when the caller supplies
4375 /// an int focus_idx directly — the engine's topic-sensitive PageRank is
4376 /// correct. The bug was at the string-to-int resolver layer in
4377 /// `tools.rs`, which silently masked the failure as "the rendering
4378 /// path doesn't propagate focus". This test locks the engine's
4379 /// behavior so a future regression in the rendering path is caught.
4380 #[test]
4381 #[expect(
4382 clippy::too_many_lines,
4383 reason = "synthetic Python-shaped graph (five FileNodes with defs + \
4384 CallRefs + ImportRefs) plus the two-call assertion sequence \
4385 is inherently long; splitting into helpers would obscure the \
4386 one-shot reproduction the test is locking in."
4387 )]
4388 fn test_focus_file_rank_delta_visible_on_python_corpus() {
4389 // Five files, Python-shaped: services/storage.py (a "hub" that two
4390 // UI files call into) plus a tests/ file. The Python tree-sitter
4391 // extractor produces `class_definition` and `function_definition`
4392 // kinds with resolved CallRefs pointing at the hub.
4393 let mut files: Vec<FileNode> = vec![
4394 FileNode {
4395 path: "device_opt/services/storage.py".to_string(),
4396 defs: vec![
4397 Definition {
4398 name: "ScanStore".to_string(),
4399 kind: "class_definition".to_string(),
4400 start_line: 1,
4401 end_line: 80,
4402 scope: String::new(),
4403 signature: None,
4404 start_byte: 0,
4405 end_byte: 2000,
4406 calls: vec![],
4407 },
4408 Definition {
4409 name: "save_scan".to_string(),
4410 kind: "function_definition".to_string(),
4411 start_line: 20,
4412 end_line: 40,
4413 scope: "class_definition ScanStore".to_string(),
4414 signature: Some("def save_scan(self, scan)".to_string()),
4415 start_byte: 200,
4416 end_byte: 600,
4417 calls: vec![],
4418 },
4419 ],
4420 imports: vec![],
4421 },
4422 FileNode {
4423 path: "device_opt/services/registry.py".to_string(),
4424 defs: vec![Definition {
4425 name: "register".to_string(),
4426 kind: "function_definition".to_string(),
4427 start_line: 1,
4428 end_line: 30,
4429 scope: String::new(),
4430 signature: Some("def register(svc)".to_string()),
4431 start_byte: 0,
4432 end_byte: 600,
4433 calls: vec![CallRef {
4434 name: "save_scan".to_string(),
4435 qualified_path: None,
4436 receiver_type: None,
4437 byte_offset: 100,
4438 resolved: None,
4439 }],
4440 }],
4441 imports: vec![ImportRef {
4442 raw_path: "from device_opt.services import storage".to_string(),
4443 resolved_idx: Some(0),
4444 }],
4445 },
4446 FileNode {
4447 path: "device_opt/ui/screens/browse.py".to_string(),
4448 defs: vec![Definition {
4449 name: "browse_scans".to_string(),
4450 kind: "function_definition".to_string(),
4451 start_line: 1,
4452 end_line: 50,
4453 scope: String::new(),
4454 signature: Some("def browse_scans(app)".to_string()),
4455 start_byte: 0,
4456 end_byte: 1000,
4457 calls: vec![CallRef {
4458 name: "save_scan".to_string(),
4459 qualified_path: None,
4460 receiver_type: None,
4461 byte_offset: 200,
4462 resolved: None,
4463 }],
4464 }],
4465 imports: vec![ImportRef {
4466 raw_path: "from device_opt.services import storage".to_string(),
4467 resolved_idx: Some(0),
4468 }],
4469 },
4470 FileNode {
4471 path: "device_opt/ui/screens/settings.py".to_string(),
4472 defs: vec![Definition {
4473 name: "open_settings".to_string(),
4474 kind: "function_definition".to_string(),
4475 start_line: 1,
4476 end_line: 40,
4477 scope: String::new(),
4478 signature: Some("def open_settings(app)".to_string()),
4479 start_byte: 0,
4480 end_byte: 800,
4481 calls: vec![CallRef {
4482 name: "register".to_string(),
4483 qualified_path: None,
4484 receiver_type: None,
4485 byte_offset: 150,
4486 resolved: None,
4487 }],
4488 }],
4489 imports: vec![ImportRef {
4490 raw_path: "from device_opt.services import registry".to_string(),
4491 resolved_idx: Some(1),
4492 }],
4493 },
4494 FileNode {
4495 path: "tests/test_storage.py".to_string(),
4496 defs: vec![Definition {
4497 name: "test_save".to_string(),
4498 kind: "function_definition".to_string(),
4499 start_line: 1,
4500 end_line: 20,
4501 scope: String::new(),
4502 signature: Some("def test_save()".to_string()),
4503 start_byte: 0,
4504 end_byte: 400,
4505 calls: vec![CallRef {
4506 name: "save_scan".to_string(),
4507 qualified_path: None,
4508 receiver_type: None,
4509 byte_offset: 50,
4510 resolved: None,
4511 }],
4512 }],
4513 imports: vec![ImportRef {
4514 raw_path: "from device_opt.services import storage".to_string(),
4515 resolved_idx: Some(0),
4516 }],
4517 },
4518 ];
4519
4520 // Resolve calls so the graph builder has edges to chew on.
4521 let def_index = build_def_index(&files);
4522 resolve_calls(&mut files, &def_index, &HashMap::new());
4523 let graph = build_graph_from_files_pub(files);
4524
4525 // Sanity: the graph must have edges (the calls were resolved).
4526 assert!(
4527 !graph.edges.is_empty(),
4528 "Python-shaped synthetic graph must produce file-level edges; got 0. \
4529 The CallRefs may have failed to resolve."
4530 );
4531
4532 // Resolve the focus file via the new helper.
4533 let focus_idx = match graph.resolve_focus_file("./device_opt/ui/screens/settings.py") {
4534 FocusResolution::Found(i) => i,
4535 other => panic!("resolver must find settings.py via LSP-shaped path, got {other:?}"),
4536 };
4537
4538 let budget = 4000;
4539 let unfocused = render_json_budgeted(&graph, budget, None, false);
4540 let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
4541
4542 // Collect rank-by-path maps for both runs.
4543 let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
4544 .files
4545 .iter()
4546 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
4547 .collect();
4548 let focused_ranks: std::collections::HashMap<String, f32> = focused
4549 .files
4550 .iter()
4551 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
4552 .collect();
4553
4554 eprintln!("T1 Python — unfocused ranks: {unfocused_ranks:#?}");
4555 eprintln!("T1 Python — focused ranks: {focused_ranks:#?}");
4556
4557 // Find at least one non-focus file whose rank changed by ≥ 5% in
4558 // either direction. The threshold is conservative; the soft 0.15
4559 // personalization alpha redistributes mass enough that on this
4560 // 5-node graph the affected neighbors typically shift by 20%+.
4561 let focus_path = "./device_opt/ui/screens/settings.py";
4562 let mut max_delta_ratio = 0.0_f32;
4563 for (path, &u_rank) in &unfocused_ranks {
4564 if path == focus_path {
4565 continue;
4566 }
4567 if let Some(&f_rank) = focused_ranks.get(path)
4568 && u_rank > 0.0
4569 {
4570 let ratio = (f_rank - u_rank).abs() / u_rank;
4571 if ratio > max_delta_ratio {
4572 max_delta_ratio = ratio;
4573 }
4574 }
4575 }
4576 assert!(
4577 max_delta_ratio >= 0.05,
4578 "focus_file must rebias non-focus file ranks by ≥ 5%; \
4579 max observed delta ratio = {max_delta_ratio:.3} \
4580 (I#20: focus_file invisible on Python corpora)"
4581 );
4582
4583 // Bit-identity guard: at least one non-focus file's rank must NOT
4584 // equal its unfocused value. This is the pathology from the
4585 // mnemosyne reproduction: every rank value was bit-identical
4586 // across global/focused calls.
4587 let any_changed = unfocused_ranks.iter().any(|(path, &u_rank)| {
4588 path != focus_path
4589 && focused_ranks
4590 .get(path)
4591 .is_some_and(|&f_rank| f_rank.to_bits() != u_rank.to_bits())
4592 });
4593 assert!(
4594 any_changed,
4595 "no non-focus file rank changed across focused/unfocused calls — \
4596 bit-identical pathology (I#20). unfocused={unfocused_ranks:#?} \
4597 focused={focused_ranks:#?}"
4598 );
4599 }
4600
4601 /// T1: focus_file rank delta on a Rust-shaped synthetic graph.
4602 ///
4603 /// Regression test: confirms the engine's topic-sensitive PageRank
4604 /// works on Rust shapes (where T1's investigation found it already
4605 /// works, but the resolver fix must not break the existing path).
4606 ///
4607 /// This complements `test_focus_file_returns_neighborhood_not_just_focus`
4608 /// by additionally checking that (a) the resolver accepts a Rust path
4609 /// with the `./` LSP prefix, and (b) at least one non-focus file's
4610 /// rank moves by ≥ 5%.
4611 #[test]
4612 #[expect(
4613 clippy::too_many_lines,
4614 reason = "synthetic Rust-shaped graph with four FileNodes plus the \
4615 two-call assertion sequence inherently exceeds the 100-line \
4616 cap; the test mirrors the Python-shaped sibling."
4617 )]
4618 fn test_focus_file_rank_delta_preserved_on_rust_corpus() {
4619 let mut files: Vec<FileNode> = vec![
4620 FileNode {
4621 path: "src/lib.rs".to_string(),
4622 defs: vec![Definition {
4623 name: "Engine".to_string(),
4624 kind: "struct_item".to_string(),
4625 start_line: 1,
4626 end_line: 30,
4627 scope: String::new(),
4628 signature: None,
4629 start_byte: 0,
4630 end_byte: 600,
4631 calls: vec![],
4632 }],
4633 imports: vec![],
4634 },
4635 FileNode {
4636 path: "src/encoder/mod.rs".to_string(),
4637 defs: vec![Definition {
4638 name: "encode".to_string(),
4639 kind: "function_item".to_string(),
4640 start_line: 1,
4641 end_line: 40,
4642 scope: String::new(),
4643 signature: Some("fn encode(input: &str) -> Vec<f32>".to_string()),
4644 start_byte: 0,
4645 end_byte: 800,
4646 calls: vec![],
4647 }],
4648 imports: vec![ImportRef {
4649 raw_path: "use crate::lib;".to_string(),
4650 resolved_idx: Some(0),
4651 }],
4652 },
4653 FileNode {
4654 path: "src/search.rs".to_string(),
4655 defs: vec![Definition {
4656 name: "search".to_string(),
4657 kind: "function_item".to_string(),
4658 start_line: 1,
4659 end_line: 30,
4660 scope: String::new(),
4661 signature: Some("fn search(q: &str) -> Hits".to_string()),
4662 start_byte: 0,
4663 end_byte: 600,
4664 calls: vec![CallRef {
4665 name: "encode".to_string(),
4666 qualified_path: None,
4667 receiver_type: None,
4668 byte_offset: 100,
4669 resolved: None,
4670 }],
4671 }],
4672 imports: vec![ImportRef {
4673 raw_path: "use crate::encoder;".to_string(),
4674 resolved_idx: Some(1),
4675 }],
4676 },
4677 FileNode {
4678 path: "src/cli.rs".to_string(),
4679 defs: vec![Definition {
4680 name: "main".to_string(),
4681 kind: "function_item".to_string(),
4682 start_line: 1,
4683 end_line: 20,
4684 scope: String::new(),
4685 signature: Some("fn main()".to_string()),
4686 start_byte: 0,
4687 end_byte: 400,
4688 calls: vec![CallRef {
4689 name: "search".to_string(),
4690 qualified_path: None,
4691 receiver_type: None,
4692 byte_offset: 50,
4693 resolved: None,
4694 }],
4695 }],
4696 imports: vec![ImportRef {
4697 raw_path: "use crate::search;".to_string(),
4698 resolved_idx: Some(2),
4699 }],
4700 },
4701 ];
4702
4703 let def_index = build_def_index(&files);
4704 resolve_calls(&mut files, &def_index, &HashMap::new());
4705 let graph = build_graph_from_files_pub(files);
4706
4707 assert!(
4708 !graph.edges.is_empty(),
4709 "Rust-shaped synthetic graph must produce edges"
4710 );
4711
4712 let focus_idx = match graph.resolve_focus_file("./src/encoder/mod.rs") {
4713 FocusResolution::Found(i) => i,
4714 other => panic!("resolver must find encoder/mod.rs via LSP path, got {other:?}"),
4715 };
4716
4717 let budget = 4000;
4718 let unfocused = render_json_budgeted(&graph, budget, None, false);
4719 let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
4720
4721 let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
4722 .files
4723 .iter()
4724 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
4725 .collect();
4726 let focused_ranks: std::collections::HashMap<String, f32> = focused
4727 .files
4728 .iter()
4729 .map(|f| (f.lsp_location.file_path.clone(), f.rank))
4730 .collect();
4731
4732 eprintln!("T1 Rust — unfocused: {unfocused_ranks:#?}");
4733 eprintln!("T1 Rust — focused: {focused_ranks:#?}");
4734
4735 let focus_path = "./src/encoder/mod.rs";
4736 let mut max_delta_ratio = 0.0_f32;
4737 for (path, &u_rank) in &unfocused_ranks {
4738 if path == focus_path {
4739 continue;
4740 }
4741 if let Some(&f_rank) = focused_ranks.get(path)
4742 && u_rank > 0.0
4743 {
4744 let ratio = (f_rank - u_rank).abs() / u_rank;
4745 if ratio > max_delta_ratio {
4746 max_delta_ratio = ratio;
4747 }
4748 }
4749 }
4750 assert!(
4751 max_delta_ratio >= 0.05,
4752 "focus_file must rebias non-focus file ranks by ≥ 5% on Rust shapes; \
4753 max observed delta = {max_delta_ratio:.3}"
4754 );
4755 }
4756
4757 #[test]
4758 fn test_pagerank_empty() {
4759 let ranks = pagerank(0, &[], None);
4760 assert!(ranks.is_empty());
4761 }
4762
4763 #[test]
4764 fn test_render_tiers() {
4765 // Build a small graph with 10 files to exercise all tiers
4766 let files: Vec<FileNode> = (0..10)
4767 .map(|i| FileNode {
4768 path: format!("src/file_{i}.rs"),
4769 defs: vec![Definition {
4770 name: format!("func_{i}"),
4771 kind: "function_item".to_string(),
4772 start_line: 1,
4773 end_line: 5,
4774 scope: String::new(),
4775 signature: Some(format!("func_{i}(x: i32) -> i32")),
4776 start_byte: 0,
4777 end_byte: 0,
4778 calls: vec![],
4779 }],
4780 imports: vec![],
4781 })
4782 .collect();
4783
4784 // Create a star graph: files 1-9 all import from file 0
4785 let edges: Vec<(u32, u32, u32)> = (1..10).map(|i| (i, 0, 1)).collect();
4786 let base_ranks = pagerank(10, &edges, None);
4787 let (top_callers, top_callees) = build_neighbor_lists(10, &edges);
4788
4789 let graph = RepoGraph {
4790 files,
4791 edges,
4792 base_ranks,
4793 callers: top_callers,
4794 callees: top_callees,
4795 def_edges: vec![],
4796 def_ranks: vec![],
4797 def_callers: vec![],
4798 def_callees: vec![],
4799 def_offsets: vec![0],
4800 alpha: 0.5,
4801 };
4802
4803 // Large budget: should include all files
4804 let full = render(&graph, 10_000, None);
4805 assert!(
4806 full.contains("file_0"),
4807 "output should contain the top-ranked file"
4808 );
4809 // file_0 should appear as tier 0 (highest rank)
4810 assert!(
4811 full.contains("## src/file_0.rs"),
4812 "top file should have tier 0 heading"
4813 );
4814
4815 // Tiny budget: should only fit a few files
4816 let small = render(&graph, 10, None);
4817 assert!(
4818 !small.is_empty(),
4819 "even tiny budget should produce some output"
4820 );
4821 // Should have fewer entries than full render
4822 let full_lines = full.lines().count();
4823 let small_lines = small.lines().count();
4824 assert!(
4825 small_lines < full_lines,
4826 "small budget ({small_lines} lines) should have fewer lines than full ({full_lines})"
4827 );
4828 }
4829
4830 #[test]
4831 fn test_render_empty_graph() {
4832 let graph = RepoGraph {
4833 files: vec![],
4834 edges: vec![],
4835 base_ranks: vec![],
4836 callers: vec![],
4837 callees: vec![],
4838 def_edges: vec![],
4839 def_ranks: vec![],
4840 def_callers: vec![],
4841 def_callees: vec![],
4842 def_offsets: vec![0],
4843 alpha: 0.5,
4844 };
4845 let output = render(&graph, 1000, None);
4846 assert!(output.is_empty(), "empty graph should render empty string");
4847 }
4848
4849 #[test]
4850 fn test_build_graph_on_fixtures() {
4851 let fixtures = Path::new(env!("CARGO_MANIFEST_DIR"))
4852 .parent()
4853 .unwrap()
4854 .parent()
4855 .unwrap()
4856 .join("tests")
4857 .join("fixtures");
4858
4859 let graph = build_graph(&fixtures).expect("build_graph should succeed on fixtures");
4860
4861 // Should find at least the 3 fixture files
4862 assert!(
4863 !graph.files.is_empty(),
4864 "graph should contain files from fixtures"
4865 );
4866
4867 // Should find definitions in the Rust fixture
4868 let rs_file = graph.files.iter().find(|f| f.path.ends_with("sample.rs"));
4869 assert!(rs_file.is_some(), "should find sample.rs");
4870 let rs_file = rs_file.unwrap();
4871 assert!(
4872 !rs_file.defs.is_empty(),
4873 "sample.rs should have definitions"
4874 );
4875 assert!(
4876 rs_file.defs.iter().any(|d| d.name == "hello"),
4877 "should find 'hello' function in sample.rs"
4878 );
4879
4880 // Should find definitions in the Python fixture
4881 let py_file = graph.files.iter().find(|f| f.path.ends_with("sample.py"));
4882 assert!(py_file.is_some(), "should find sample.py");
4883 let py_file = py_file.unwrap();
4884 assert!(
4885 !py_file.defs.is_empty(),
4886 "sample.py should have definitions"
4887 );
4888 assert!(
4889 py_file.defs.iter().any(|d| d.name == "greet"),
4890 "should find 'greet' function in sample.py"
4891 );
4892
4893 // PageRank scores should be computed
4894 assert_eq!(graph.base_ranks.len(), graph.files.len());
4895 let sum: f32 = graph.base_ranks.iter().sum();
4896 assert!(
4897 (sum - 1.0).abs() < 0.01,
4898 "PageRank scores should sum to ~1.0, got {sum}"
4899 );
4900 }
4901
4902 #[test]
4903 fn test_extract_imports_rust() {
4904 let source = "use crate::foo::bar;\nuse std::collections::HashMap;\n";
4905 let (lang, query) = import_query_for_extension("rs").unwrap();
4906 let imports = extract_imports(source, &lang, &query);
4907 assert_eq!(imports.len(), 2);
4908 assert!(imports[0].contains("crate::foo::bar"));
4909 }
4910
4911 #[test]
4912 fn test_extract_imports_python_stub() {
4913 let source = "from typing import Protocol\nimport pkg.types\n";
4914 let (lang, query) = import_query_for_extension("pyi").unwrap();
4915 let imports = extract_imports(source, &lang, &query);
4916 assert_eq!(imports.len(), 2);
4917 assert!(imports[0].contains("from typing import Protocol"));
4918 assert!(imports[1].contains("import pkg.types"));
4919 }
4920
4921 #[test]
4922 fn test_resolve_python_import_to_stub_file() {
4923 let root = PathBuf::from("/project");
4924 let mut file_index = HashMap::new();
4925 file_index.insert(PathBuf::from("/project/pkg/types.pyi"), 1);
4926
4927 let result = resolve_python_import("import pkg.types", &root, &file_index);
4928 assert_eq!(result, Some(1));
4929 }
4930
4931 #[test]
4932 fn test_resolve_rust_crate_import() {
4933 let root = PathBuf::from("/project");
4934 let file_path = PathBuf::from("/project/src/main.rs");
4935 let mut file_index = HashMap::new();
4936 file_index.insert(PathBuf::from("/project/src/foo/bar.rs"), 1);
4937 file_index.insert(PathBuf::from("/project/src/main.rs"), 0);
4938
4939 let result = resolve_rust_import("use crate::foo::bar;", &file_path, &root, &file_index);
4940 assert_eq!(result, Some(1));
4941 }
4942
4943 #[test]
4944 fn test_resolve_rust_external_crate_dropped() {
4945 let root = PathBuf::from("/project");
4946 let file_path = PathBuf::from("/project/src/main.rs");
4947 let file_index = HashMap::new();
4948
4949 let result = resolve_rust_import(
4950 "use std::collections::HashMap;",
4951 &file_path,
4952 &root,
4953 &file_index,
4954 );
4955 assert_eq!(result, None, "external crate imports should be dropped");
4956 }
4957
4958 #[test]
4959 fn test_neighbor_lists() {
4960 // 0 -> 1, 0 -> 2, 1 -> 2
4961 let edges = vec![(0, 1, 1), (0, 2, 1), (1, 2, 1)];
4962 let (incoming, outgoing) = build_neighbor_lists(3, &edges);
4963
4964 // Node 2 should be called by 0 and 1
4965 assert!(incoming[2].contains(&0));
4966 assert!(incoming[2].contains(&1));
4967
4968 // Node 0 should call 1 and 2
4969 assert!(outgoing[0].contains(&1));
4970 assert!(outgoing[0].contains(&2));
4971 }
4972
4973 /// G1 (R2.3 issue a): A scoped call `mod_a::foo()` must store:
4974 /// - `name = "foo"` (bare identifier, for def-index lookup)
4975 /// - `qualified_path = Some("mod_a::foo")` (full path, for disambiguation)
4976 ///
4977 /// Before G1, `name` stored the full `"mod_a::foo"` path. After G1, `name`
4978 /// is always the bare trailing identifier and `qualified_path` carries the
4979 /// full path when the call is scoped.
4980 #[test]
4981 fn test_scoped_identifier_calls_preserve_path() {
4982 use crate::languages;
4983 use streaming_iterator::StreamingIterator as _;
4984
4985 let source = "
4986mod mod_a {
4987 pub fn foo() {}
4988}
4989mod mod_b {
4990 pub fn foo() {}
4991}
4992fn caller() {
4993 mod_a::foo();
4994 mod_b::foo();
4995}
4996";
4997 let call_config =
4998 languages::call_query_for_extension("rs").expect("Rust call config must exist");
4999 let lang_config =
5000 languages::config_for_extension("rs").expect("Rust lang config must exist");
5001
5002 let mut defs = {
5003 let mut parser = tree_sitter::Parser::new();
5004 parser.set_language(&lang_config.language).unwrap();
5005 let tree = parser.parse(source, None).unwrap();
5006 let mut cursor = tree_sitter::QueryCursor::new();
5007 let mut out = Vec::new();
5008 let mut matches =
5009 cursor.matches(&lang_config.query, tree.root_node(), source.as_bytes());
5010 while let Some(m) = matches.next() {
5011 let mut name = String::new();
5012 let mut def_node = None;
5013 for cap in m.captures {
5014 let cname = &lang_config.query.capture_names()[cap.index as usize];
5015 if *cname == "name" {
5016 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
5017 } else if *cname == "def" {
5018 def_node = Some(cap.node);
5019 }
5020 }
5021 if let Some(node) = def_node {
5022 #[expect(clippy::cast_possible_truncation)]
5023 out.push(Definition {
5024 name,
5025 kind: node.kind().to_string(),
5026 start_line: node.start_position().row as u32 + 1,
5027 end_line: node.end_position().row as u32 + 1,
5028 scope: String::new(),
5029 signature: None,
5030 start_byte: node.start_byte() as u32,
5031 end_byte: node.end_byte() as u32,
5032 calls: vec![],
5033 });
5034 }
5035 }
5036 out
5037 };
5038
5039 extract_calls(source, &call_config, &mut defs);
5040
5041 // Find the `caller` function definition
5042 let caller_def = defs
5043 .iter()
5044 .find(|d| d.name == "caller")
5045 .expect("caller def");
5046
5047 // G1: bare name is "foo", qualified_path carries the module path.
5048 let call_names: Vec<&str> = caller_def.calls.iter().map(|c| c.name.as_str()).collect();
5049 let qualified_paths: Vec<Option<&str>> = caller_def
5050 .calls
5051 .iter()
5052 .map(|c| c.qualified_path.as_deref())
5053 .collect();
5054
5055 // Bare names must be the trailing identifier only.
5056 assert!(
5057 call_names.contains(&"foo"),
5058 "bare name 'foo' must appear for scoped calls; got: {call_names:?}"
5059 );
5060 // Qualified paths must carry the full scope.
5061 assert!(
5062 qualified_paths.contains(&Some("mod_a::foo")),
5063 "qualified_path 'mod_a::foo' must appear; got: {qualified_paths:?}"
5064 );
5065 assert!(
5066 qualified_paths.contains(&Some("mod_b::foo")),
5067 "qualified_path 'mod_b::foo' must appear; got: {qualified_paths:?}"
5068 );
5069 // Full paths must NOT appear in bare names.
5070 assert!(
5071 !call_names.contains(&"mod_a::foo"),
5072 "full path 'mod_a::foo' must not appear in bare name; got: {call_names:?}"
5073 );
5074 }
5075
5076 /// RED test (R2.3 issue b+c): Two defs named `Read` in different modules,
5077 /// an unqualified call to `Read`. Resolution must NOT silently pick the first.
5078 /// Either both are returned (ambiguous) or none.
5079 #[test]
5080 fn test_ambiguous_name_resolution_returns_all_or_none() {
5081 // Build two FileNodes each with a def named "Read", then a third with an
5082 // unqualified call to "Read".
5083 let file_a = FileNode {
5084 path: "mod_a.rs".to_string(),
5085 defs: vec![Definition {
5086 name: "Read".to_string(),
5087 kind: "trait_item".to_string(),
5088 start_line: 1,
5089 end_line: 3,
5090 scope: String::new(),
5091 signature: None,
5092 start_byte: 0,
5093 end_byte: 50,
5094 calls: vec![],
5095 }],
5096 imports: vec![],
5097 };
5098 let file_b = FileNode {
5099 path: "mod_b.rs".to_string(),
5100 defs: vec![Definition {
5101 name: "Read".to_string(),
5102 kind: "trait_item".to_string(),
5103 start_line: 1,
5104 end_line: 3,
5105 scope: String::new(),
5106 signature: None,
5107 start_byte: 0,
5108 end_byte: 50,
5109 calls: vec![],
5110 }],
5111 imports: vec![],
5112 };
5113 let file_c = FileNode {
5114 path: "caller.rs".to_string(),
5115 defs: vec![Definition {
5116 name: "do_thing".to_string(),
5117 kind: "function_item".to_string(),
5118 start_line: 1,
5119 end_line: 5,
5120 scope: String::new(),
5121 signature: None,
5122 start_byte: 0,
5123 end_byte: 100,
5124 calls: vec![CallRef {
5125 name: "Read".to_string(),
5126 qualified_path: None,
5127 receiver_type: None,
5128 byte_offset: 10,
5129 resolved: None,
5130 }],
5131 }],
5132 imports: vec![],
5133 };
5134
5135 let mut files = vec![file_a, file_b, file_c];
5136 let def_index = build_def_index(&files);
5137 resolve_calls(&mut files, &def_index, &HashMap::new());
5138
5139 // The unqualified call to "Read" is ambiguous (two candidates, neither in same
5140 // file nor imported). Resolution must leave it as None — silent first-wins is wrong.
5141 let resolved = files[2].defs[0].calls[0].resolved;
5142 assert_eq!(
5143 resolved, None,
5144 "ambiguous unqualified call with no import context must resolve to None, not silently pick first"
5145 );
5146 }
5147
5148 // ── D1 / D2 tests ────────────────────────────────────────────────
5149
5150 /// Build a small test graph with N files and an optional JSON-extension file.
5151 fn build_test_graph(n_code: usize, include_json: bool) -> (RepoGraph, Vec<usize>) {
5152 let mut file_nodes: Vec<FileNode> = (0..n_code)
5153 .map(|i| FileNode {
5154 path: format!("src/file_{i}.rs"),
5155 defs: vec![
5156 Definition {
5157 name: format!("func_{i}"),
5158 kind: "function_item".to_string(),
5159 start_line: 1,
5160 end_line: 5,
5161 scope: String::new(),
5162 signature: Some(format!("fn func_{i}() -> i32")),
5163 start_byte: 0,
5164 end_byte: 100,
5165 calls: vec![],
5166 },
5167 Definition {
5168 name: format!("MyStruct{i}"),
5169 kind: "struct_item".to_string(),
5170 start_line: 7,
5171 end_line: 10,
5172 scope: String::new(),
5173 signature: None,
5174 start_byte: 110,
5175 end_byte: 200,
5176 calls: vec![],
5177 },
5178 ],
5179 imports: vec![],
5180 })
5181 .collect();
5182
5183 let json_idx = if include_json {
5184 let idx = file_nodes.len();
5185 file_nodes.push(FileNode {
5186 path: "data/config.json".to_string(),
5187 defs: vec![],
5188 imports: vec![],
5189 });
5190 vec![idx]
5191 } else {
5192 vec![]
5193 };
5194
5195 // Build a star graph: all code files point to file_0.
5196 let n = file_nodes.len();
5197 #[expect(clippy::cast_possible_truncation, reason = "test: n_code << u32::MAX")]
5198 let edges: Vec<(u32, u32, u32)> = (1..n_code).map(|i| (i as u32, 0, 1)).collect();
5199
5200 let base_ranks = pagerank(n, &edges, None);
5201 let (callers, callees) = build_neighbor_lists(n, &edges);
5202
5203 let graph = RepoGraph {
5204 files: file_nodes,
5205 edges,
5206 base_ranks,
5207 callers,
5208 callees,
5209 def_edges: vec![],
5210 def_ranks: vec![],
5211 def_callers: vec![],
5212 def_callees: vec![],
5213 def_offsets: vec![0],
5214 alpha: 0.5,
5215 };
5216
5217 (graph, json_idx)
5218 }
5219
5220 /// D1: `render_json` returns a `GetRepoMapResponse` with a `files` array.
5221 ///
5222 /// On the baseline (before D1) `get_repo_map_ripvec` returned markdown prose via
5223 /// `repo_map::render`; no `files` key existed in the output.
5224 #[test]
5225 fn get_repo_map_returns_json_with_files_array() {
5226 let (graph, _) = build_test_graph(5, false);
5227 let response = render_json(&graph, 50, None, false);
5228 assert!(
5229 !response.files.is_empty(),
5230 "files array should be non-empty for a non-empty graph"
5231 );
5232 // Serialize and verify the JSON shape has a `files` key.
5233 let json = serde_json::to_string(&response).expect("serialize");
5234 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5235 assert!(
5236 parsed["files"].is_array(),
5237 "serialized response must have a `files` JSON array; got: {parsed}"
5238 );
5239 }
5240
5241 /// D1: every file entry has an `lsp_location` field.
5242 ///
5243 /// Before D1, output was prose text; no `lsp_location` existed anywhere in the response.
5244 #[test]
5245 fn get_repo_map_each_file_has_lsp_location() {
5246 let (graph, _) = build_test_graph(5, false);
5247 let response = render_json(&graph, 50, None, false);
5248 for file in &response.files {
5249 assert!(
5250 !file.lsp_location.file_path.is_empty(),
5251 "each file must have a non-empty lsp_location.file_path"
5252 );
5253 }
5254 // Also verify through JSON.
5255 let json = serde_json::to_string(&response).expect("serialize");
5256 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5257 for entry in parsed["files"].as_array().expect("files array") {
5258 assert!(
5259 entry["lsp_location"]["file_path"].is_string(),
5260 "each file entry must have lsp_location.file_path string; entry: {entry}"
5261 );
5262 }
5263 }
5264
5265 /// D1: every symbol has a `kind` (u32) and an `lsp_location`.
5266 ///
5267 /// Before D1 symbols were rendered as prose strings like `"function_item func_0"`.
5268 #[test]
5269 fn get_repo_map_each_symbol_has_kind_and_lsp_location() {
5270 let (graph, _) = build_test_graph(3, false);
5271 let response = render_json(&graph, 50, None, false);
5272 for file in &response.files {
5273 for sym in &file.symbols {
5274 assert!(
5275 sym.kind > 0,
5276 "symbol kind must be a positive LSP SymbolKind; got 0 for '{}'",
5277 sym.name
5278 );
5279 assert!(
5280 !sym.lsp_location.file_path.is_empty(),
5281 "symbol must have lsp_location.file_path"
5282 );
5283 }
5284 }
5285 // Verify through JSON: kind should be a number.
5286 let json = serde_json::to_string(&response).expect("serialize");
5287 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5288 for file_entry in parsed["files"].as_array().expect("files") {
5289 for sym_entry in file_entry["symbols"].as_array().expect("symbols") {
5290 assert!(
5291 sym_entry["kind"].is_number(),
5292 "symbol `kind` must be a JSON number; sym: {sym_entry}"
5293 );
5294 assert!(
5295 sym_entry["lsp_location"]["file_path"].is_string(),
5296 "symbol must have lsp_location.file_path; sym: {sym_entry}"
5297 );
5298 }
5299 }
5300 }
5301
5302 /// D1: `calls` field is an array of `RepoMapCall`-shaped objects (each has
5303 /// `lsp_location` and `rank`).
5304 ///
5305 /// In 4.0.1 calls moved from bare `lsp_location` objects to `RepoMapCall`
5306 /// objects that carry both the target `lsp_location` and the target file's
5307 /// `base_rank`.
5308 #[test]
5309 fn get_repo_map_calls_field_is_array_of_lsp_locations() {
5310 // Build a 5-file star graph so file_0 has non-empty callees.
5311 let (graph, _) = build_test_graph(5, false);
5312 let response = render_json(&graph, 50, None, false);
5313 let json = serde_json::to_string(&response).expect("serialize");
5314 let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5315 for file_entry in parsed["files"].as_array().expect("files") {
5316 let calls = file_entry["calls"]
5317 .as_array()
5318 .expect("calls must be an array");
5319 for call in calls {
5320 // In 4.0.1 each call entry is a RepoMapCall with lsp_location + rank.
5321 assert!(
5322 call["lsp_location"]["file_path"].is_string(),
5323 "each call entry must have lsp_location.file_path string; call: {call}"
5324 );
5325 assert!(
5326 call["rank"].is_number(),
5327 "each call entry must have a numeric rank; call: {call}"
5328 );
5329 }
5330 }
5331 }
5332
5333 /// D2 / G3: `render_json_budgeted` with a very tight budget returns fewer files.
5334 ///
5335 /// Before the budget allocator, `max_files=3` controlled file count but not
5336 /// per-file expansion. In 4.0.1 the token_budget controls total bytes; with
5337 /// a budget of 1 token (= 4 bytes) only the envelope minimum allows any file
5338 /// at all, and the test verifies that the total_files counter still reflects
5339 /// the full eligible count. `render_json` (compat shim) passes a generous
5340 /// budget; use `render_json_budgeted` with a tight budget to verify the cap.
5341 #[test]
5342 fn get_repo_map_returns_at_most_max_files_files() {
5343 let (graph, _) = build_test_graph(10, false);
5344 // Use render_json_budgeted directly with a tight budget (600 bytes = 3 files
5345 // × 200-byte floor). Each file's envelope minimum is 200 bytes so a 600-byte
5346 // budget should admit at most 3 files.
5347 let response = render_json_budgeted(&graph, 150, None, false);
5348 assert!(
5349 response.files.len() <= 3,
5350 "files.len() = {} must be <= 3 for a 600-byte budget",
5351 response.files.len()
5352 );
5353 assert_eq!(
5354 response.total_files, 10,
5355 "total_files must reflect the full eligible count before budget cap"
5356 );
5357 assert!(
5358 response.capped,
5359 "capped must be true when total_files > files.len()"
5360 );
5361 }
5362
5363 /// D2: `include_metadata=false` (default) excludes JSON/TOML/etc. files.
5364 ///
5365 /// Before D2, JSON files with thousands of repeated keys dominated the
5366 /// output (Issue #5 — JSON-key flooding).
5367 #[test]
5368 fn get_repo_map_excludes_meta_by_default() {
5369 let (graph, _) = build_test_graph(3, /*include_json=*/ true);
5370 // Default: include_metadata = false
5371 let response = render_json(&graph, 50, None, false);
5372 for file in &response.files {
5373 assert!(
5374 !std::path::Path::new(&file.lsp_location.file_path)
5375 .extension()
5376 .is_some_and(|e| e.eq_ignore_ascii_case("json")),
5377 "JSON (Meta) files must be excluded when include_metadata=false; found: {}",
5378 file.lsp_location.file_path
5379 );
5380 }
5381 }
5382
5383 /// D2: `include_metadata=true` includes JSON files.
5384 ///
5385 /// Callers who opt-in to metadata should see all content kinds.
5386 #[test]
5387 fn get_repo_map_include_metadata_true_includes_json() {
5388 let (graph, _) = build_test_graph(3, /*include_json=*/ true);
5389 let response = render_json(&graph, 50, None, true);
5390 let has_json = response.files.iter().any(|f| {
5391 std::path::Path::new(&f.lsp_location.file_path)
5392 .extension()
5393 .is_some_and(|e| e.eq_ignore_ascii_case("json"))
5394 });
5395 assert!(
5396 has_json,
5397 "JSON file must be present when include_metadata=true"
5398 );
5399 }
5400
5401 /// J1/J2 MEASUREMENT: flask corpus focus_file=blueprints.py rank dispersion.
5402 ///
5403 /// Mandatory measurement from the 4.0.5 Wave-2 Front-C briefing:
5404 /// - `len(files) >= 8` (not collapsed to just the focus)
5405 /// - focus file rank is the highest in the response
5406 /// - next 5 files all have rank >= 10% of focus rank
5407 /// - neighborhood contains semantically related files (app.py, scaffold.py)
5408 #[test]
5409 #[ignore = "runs on flask corpus at tests/corpus/code/flask; use --ignored --nocapture"]
5410 #[expect(
5411 clippy::too_many_lines,
5412 reason = "end-to-end corpus measurement test; assertion sequence is sequential and cannot be meaningfully split"
5413 )]
5414 fn test_flask_focus_blueprints_rank_dispersion() {
5415 let corpus_root = Path::new(env!("CARGO_MANIFEST_DIR"))
5416 .parent()
5417 .unwrap()
5418 .parent()
5419 .unwrap()
5420 .join("tests/corpus/code/flask");
5421
5422 assert!(
5423 corpus_root.exists(),
5424 "flask corpus not found at {}",
5425 corpus_root.display()
5426 );
5427
5428 let graph = build_graph(&corpus_root).expect("build_graph on flask corpus");
5429 eprintln!("Flask corpus: {} files in graph", graph.files.len());
5430
5431 // Find focus file
5432 let focus_path = "src/flask/blueprints.py";
5433 let focus_idx = graph.files.iter().position(|f| f.path == focus_path);
5434 eprintln!("Focus file '{focus_path}' -> idx: {focus_idx:?}");
5435 assert!(
5436 focus_idx.is_some(),
5437 "blueprints.py not found in graph; available files: {:?}",
5438 graph
5439 .files
5440 .iter()
5441 .map(|f| &f.path)
5442 .take(20)
5443 .collect::<Vec<_>>()
5444 );
5445
5446 let response = render_json_budgeted(&graph, 4000, focus_idx, false);
5447
5448 // Criterion 1: at least 8 files returned.
5449 eprintln!(
5450 "Focused response: {} files (total_files={})",
5451 response.files.len(),
5452 response.total_files
5453 );
5454 assert!(
5455 response.files.len() >= 8,
5456 "expected >= 8 files in focused response; got {} — I#16 winner-take-all collapse",
5457 response.files.len()
5458 );
5459
5460 // Print top 10 for inspection.
5461 eprintln!("\nTop 10 focused files:");
5462 for (i, f) in response.files.iter().take(10).enumerate() {
5463 eprintln!(" [{i}] rank={:.6} {}", f.rank, f.lsp_location.file_path);
5464 }
5465
5466 // Criterion 2: focus file must appear near the top (top-3) of focused
5467 // results. With PERSONALIZATION_ALPHA=0.15 and the flask corpus,
5468 // src/flask/app.py has higher structural rank than blueprints.py and
5469 // may legitimately rank #1 — the focus boosts blueprints.py relative
5470 // to its unfocused position, but doesn't guarantee it beats every
5471 // structurally central neighbor. Being in top-3 confirms the bias
5472 // is working (pre-fix blueprints.py was #1 at 0.703 but that was a
5473 // degenerate collapse; now #1 or #2 is healthy).
5474 let focus_file_rank = response
5475 .files
5476 .iter()
5477 .find(|f| {
5478 f.lsp_location.file_path.contains("blueprints.py")
5479 && !f.lsp_location.file_path.contains("test_")
5480 && !f.lsp_location.file_path.contains("sansio")
5481 })
5482 .map(|f| f.rank)
5483 .unwrap_or(0.0);
5484 let focus_position = response
5485 .files
5486 .iter()
5487 .position(|f| {
5488 f.lsp_location.file_path.contains("blueprints.py")
5489 && !f.lsp_location.file_path.contains("test_")
5490 && !f.lsp_location.file_path.contains("sansio")
5491 })
5492 .unwrap_or(usize::MAX);
5493 eprintln!(
5494 "\nblueprinets.py position: #{} rank={:.6}",
5495 focus_position + 1,
5496 focus_file_rank
5497 );
5498 assert!(
5499 focus_position < 3,
5500 "blueprints.py must be in top-3 focused results (got position {}); \
5501 soft personalization must rebias toward focus neighborhood — I#16",
5502 focus_position + 1
5503 );
5504
5505 // Criterion 3: next 5 non-focus files have rank >= 10% of the top
5506 // file's rank. This is the core dispersion check: no more Dirac-delta
5507 // collapse where one file is 0.703 and all others are 0.003.
5508 let top_rank = response.files[0].rank;
5509 let non_focus_min_5 = response
5510 .files
5511 .iter()
5512 .filter(|f| {
5513 !(f.lsp_location.file_path.contains("blueprints.py")
5514 && !f.lsp_location.file_path.contains("test_")
5515 && !f.lsp_location.file_path.contains("sansio"))
5516 })
5517 .take(5)
5518 .map(|f| f.rank)
5519 .fold(f32::INFINITY, f32::min);
5520 let pct = non_focus_min_5 / top_rank * 100.0;
5521 eprintln!(
5522 "\nNext-5 (non-focus) min rank: {non_focus_min_5:.6} = {pct:.1}% of top rank {top_rank:.6}"
5523 );
5524 assert!(
5525 pct >= 10.0,
5526 "next-5 non-focus files min rank is {pct:.1}% of top rank (need ≥ 10%); \
5527 files are collapsing to near-zero floor — I#16"
5528 );
5529
5530 // Criterion 4: neighborhood quality — related files present.
5531 let related_names = ["app.py", "scaffold.py", "sansio"];
5532 let found_related: Vec<&str> = related_names
5533 .iter()
5534 .copied()
5535 .filter(|name| {
5536 response
5537 .files
5538 .iter()
5539 .any(|f| f.lsp_location.file_path.contains(name))
5540 })
5541 .collect();
5542 eprintln!("\nNeighborhood quality: found related files: {found_related:?}");
5543 // At least one related file should appear (soft assertion — log if missing).
5544 if found_related.is_empty() {
5545 eprintln!(
5546 "WARNING: no expected related files (app.py, scaffold.py) found in neighborhood"
5547 );
5548 }
5549 }
5550
5551 #[test]
5552 #[ignore = "runs on full ripvec codebase; use --nocapture to see output"]
5553 fn test_full_repo_map() {
5554 use std::time::Instant;
5555
5556 let root = Path::new(env!("CARGO_MANIFEST_DIR"))
5557 .parent()
5558 .unwrap()
5559 .parent()
5560 .unwrap();
5561
5562 // Phase 1: build_graph (walk + parse + import resolve + PageRank)
5563 let t0 = Instant::now();
5564 let graph = build_graph(root).expect("build_graph on ripvec root");
5565 let build_ms = t0.elapsed().as_secs_f64() * 1000.0;
5566
5567 // Phase 2: render (default, no focus)
5568 let t1 = Instant::now();
5569 let rendered = render(&graph, 2000, None);
5570 let render_ms = t1.elapsed().as_secs_f64() * 1000.0;
5571
5572 // Phase 3: render (topic-sensitive, focused on highest-ranked file)
5573 let t2 = Instant::now();
5574 let focus_idx = graph
5575 .base_ranks
5576 .iter()
5577 .enumerate()
5578 .max_by(|a, b| a.1.total_cmp(b.1))
5579 .map(|(i, _)| i);
5580 let focused = render(&graph, 2000, focus_idx);
5581 let focus_ms = t2.elapsed().as_secs_f64() * 1000.0;
5582
5583 eprintln!("\n=== Repo Map Performance ===");
5584 eprintln!(
5585 "Files: {}, Edges: {}, Defs: {}",
5586 graph.files.len(),
5587 graph.edges.len(),
5588 graph.files.iter().map(|f| f.defs.len()).sum::<usize>()
5589 );
5590 eprintln!("build_graph: {build_ms:.1}ms (walk + parse + resolve + PageRank)");
5591 eprintln!(
5592 "render(default): {render_ms:.3}ms ({} chars, ~{} tokens)",
5593 rendered.len(),
5594 rendered.len() / 4
5595 );
5596 eprintln!(
5597 "render(focused): {focus_ms:.3}ms ({} chars, ~{} tokens)",
5598 focused.len(),
5599 focused.len() / 4
5600 );
5601
5602 eprintln!("\nTop 5 by PageRank:");
5603 let mut ranked: Vec<(usize, f32)> = graph.base_ranks.iter().copied().enumerate().collect();
5604 ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
5605 for (i, rank) in ranked.iter().take(5) {
5606 eprintln!(" {:.4} {}", rank, graph.files[*i].path);
5607 }
5608
5609 eprintln!("\n=== Default Render ===\n{rendered}");
5610 eprintln!(
5611 "\n=== Focused Render (on {}) ===\n{focused}",
5612 focus_idx
5613 .map(|i| graph.files[i].path.as_str())
5614 .unwrap_or("none")
5615 );
5616 }
5617}