Skip to main content

ripvec_core/
repo_map.rs

1//! `PageRank`-weighted structural overview of a codebase.
2//!
3//! Builds a dependency graph from tree-sitter definition and import extraction,
4//! ranks files by importance using `PageRank` (standard or topic-sensitive), and
5//! renders a budget-constrained overview with tiered detail levels.
6
7use std::collections::{HashMap, HashSet};
8use std::fmt::Write as _;
9use std::path::{Path, PathBuf};
10
11use rayon::prelude::*;
12use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
13use streaming_iterator::StreamingIterator;
14use tree_sitter::{Parser, Query, QueryCursor};
15
16use serde::Serialize;
17
18use crate::chunk::ContentKind;
19use crate::languages;
20use crate::walk;
21
22/// Serialize a `ContentKind` to a lowercase string tag for JSON output.
23fn content_kind_tag(ck: ContentKind) -> &'static str {
24    match ck {
25        ContentKind::Code => "code",
26        ContentKind::Docs => "docs",
27        ContentKind::Meta => "meta",
28    }
29}
30
31// ── Data Structures ──────────────────────────────────────────────────
32
33/// Persisted dependency graph with `PageRank` scores.
34#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
35pub struct RepoGraph {
36    /// Files in the repository with definitions, imports, and calls.
37    pub files: Vec<FileNode>,
38    /// File-level edges (derived from def-level call edges).
39    pub edges: Vec<(u32, u32, u32)>,
40    /// File-level `PageRank` scores (aggregated from def-level).
41    pub base_ranks: Vec<f32>,
42    /// File-level callers (indices into `files`).
43    pub callers: Vec<Vec<u32>>,
44    /// File-level callees (indices into `files`).
45    pub callees: Vec<Vec<u32>>,
46    /// Definition-level call edges: `(caller_def, callee_def, weight)`.
47    pub def_edges: Vec<(DefId, DefId, u32)>,
48    /// Definition-level `PageRank` scores (flattened: `offsets[file_idx] + def_idx`).
49    pub def_ranks: Vec<f32>,
50    /// Definition-level callers (flattened, parallel to `def_ranks`).
51    pub def_callers: Vec<Vec<DefId>>,
52    /// Definition-level callees (flattened, parallel to `def_ranks`).
53    pub def_callees: Vec<Vec<DefId>>,
54    /// Prefix-sum offsets for flattening `DefId` to linear index.
55    pub def_offsets: Vec<usize>,
56    /// Auto-tuned alpha for search boost.
57    pub alpha: f32,
58}
59
60/// A file in the repository with its definitions and imports.
61#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
62pub struct FileNode {
63    /// Relative path from the repository root.
64    pub path: String,
65    /// Definitions (functions, structs, classes, etc.) extracted from this file.
66    pub defs: Vec<Definition>,
67    /// Import references extracted from this file.
68    pub imports: Vec<ImportRef>,
69}
70
71/// A definition extracted from a source file.
72#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
73pub struct Definition {
74    /// Name of the definition (e.g., function name, class name).
75    pub name: String,
76    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
77    pub kind: String,
78    /// 1-based start line number.
79    pub start_line: u32,
80    /// 1-based end line number.
81    pub end_line: u32,
82    /// Scope chain (e.g., `"impl_item Foo > fn bar"`).
83    pub scope: String,
84    /// Function/method signature, if available.
85    pub signature: Option<String>,
86    /// Byte offset of this definition's start in the source file.
87    pub start_byte: u32,
88    /// Byte offset of this definition's end in the source file.
89    pub end_byte: u32,
90    /// Call sites within this definition's body.
91    pub calls: Vec<CallRef>,
92}
93
94/// An import reference extracted from a source file.
95#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
96pub struct ImportRef {
97    /// Raw import path as written in source (e.g., `crate::foo::bar`).
98    pub raw_path: String,
99    /// Resolved file index in [`RepoGraph::files`], if resolution succeeded.
100    pub resolved_idx: Option<u32>,
101}
102
103/// Unique identifier for a definition: (file index, definition index within file).
104pub type DefId = (u32, u16);
105
106/// A call site extracted from a definition body.
107#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
108pub struct CallRef {
109    /// Callee function/method name (bare, without qualifier).
110    ///
111    /// For scoped calls like `mod_a::foo()`, this is `"foo"`.
112    /// For bare calls like `foo()`, this is `"foo"`.
113    pub name: String,
114    /// Full qualified path for scoped calls, e.g. `Some("mod_a::foo")`.
115    ///
116    /// `None` for bare (unqualified) calls. When `Some`, `resolve_calls`
117    /// uses this for qualifier-based module disambiguation before falling
118    /// back to the bare `name`.
119    pub qualified_path: Option<String>,
120    /// Receiver type for method calls, inferred from local context.
121    ///
122    /// Set to `Some("Foo")` when:
123    /// - The call is `self.method()` inside `impl Foo { … }`.
124    /// - The call is `x.method()` where `x` has an explicit type annotation `x: Foo`.
125    /// - The call is `x.method()` after `let x = Foo::new()`.
126    ///
127    /// `None` for free function calls, or when the receiver type cannot be
128    /// inferred from local context alone. When `Some`, `resolve_calls` prefers
129    /// defs whose enclosing impl scope matches the receiver type.
130    pub receiver_type: Option<String>,
131    /// Byte offset of the call in the source file (for scoping to definitions).
132    pub byte_offset: u32,
133    /// Resolved target definition, if resolution succeeded.
134    pub resolved: Option<DefId>,
135}
136
137// ── JSON output types ────────────────────────────────────────────────
138
139/// LSP-shaped location pointing at a file or symbol within a file.
140///
141/// Lines and characters are 0-based, matching the Language Server Protocol
142/// convention so callers can pass this directly to LSP tools without any
143/// conversion.
144#[derive(Debug, Clone, Serialize)]
145pub struct RepoMapLspLocation {
146    /// Relative path from the repository root (prefixed with `./`).
147    pub file_path: String,
148    /// 0-based start line.
149    pub start_line: usize,
150    /// 0-based start character (0 for file-level locations).
151    pub start_character: usize,
152    /// 0-based end line (equals `start_line` for file-level locations).
153    pub end_line: usize,
154    /// 0-based end character (0 for file-level locations).
155    pub end_character: usize,
156}
157
158/// A top-level symbol extracted from a file in the repository map.
159///
160/// Analogous to an LSP `DocumentSymbol` but limited to the fields available
161/// from tree-sitter definition extraction. The `rank` field carries the
162/// definition-level `PageRank` score from [`RepoGraph::def_ranks`], enabling
163/// callers to prioritise symbols by structural importance.
164#[derive(Debug, Clone, Serialize)]
165pub struct RepoMapSymbol {
166    /// Symbol name (function name, struct name, etc.).
167    pub name: String,
168    /// LSP `SymbolKind` as a decimal — use the same values as
169    /// `lsp_workspace_symbols` and `lsp_document_symbols`.
170    pub kind: u32,
171    /// Location pointing at the symbol's definition line (0-based).
172    pub lsp_location: RepoMapLspLocation,
173    /// Definition-level `PageRank` score from [`RepoGraph::def_ranks`].
174    ///
175    /// Higher values indicate definitions that are called by many other
176    /// definitions. Used by the token-budget allocator to decide which
177    /// symbols to include when the per-file budget is constrained.
178    pub rank: f32,
179}
180
181/// An outgoing call-edge from a file to another file.
182///
183/// Carries both the target file's `lsp_location` and its `base_rank`
184/// (file-level `PageRank` score) so callers can decide how important
185/// each dependency is without a separate lookup.
186#[derive(Debug, Clone, Serialize)]
187pub struct RepoMapCall {
188    /// Location pointing at the target file (line 0, character 0).
189    pub lsp_location: RepoMapLspLocation,
190    /// File-level `PageRank` score of the target file.
191    pub rank: f32,
192}
193
194/// One file entry in the JSON repo map.
195///
196/// Carries the file's `PageRank` score, content kind, outgoing call-edges to
197/// other files, and the file's top-level symbol definitions — all with
198/// `lsp_location` so the caller can chain directly into LSP tools without
199/// any destructuring.
200#[derive(Debug, Clone, Serialize)]
201pub struct RepoMapFile {
202    /// Location pointing at the file itself (line 0, character 0).
203    ///
204    /// Pass `lsp_location.file_path` directly into `lsp_document_symbols` or
205    /// any other file-scoped tool.
206    pub lsp_location: RepoMapLspLocation,
207    /// `PageRank` score in [0, 1] (higher = more structurally central).
208    pub rank: f32,
209    /// Content classification: `"code"`, `"docs"`, or `"meta"`.
210    ///
211    /// Serialized as a lowercase string tag so JSON consumers can branch
212    /// without numeric magic values. Mirrors the `ContentKind` enum in
213    /// `ripvec-core::chunk`.
214    pub content_kind: &'static str,
215    /// Outgoing call-edges sorted by target file `PageRank` descending.
216    pub calls: Vec<RepoMapCall>,
217    /// Top-level definitions extracted from this file by tree-sitter,
218    /// sorted by definition-level `PageRank` descending and pruned to
219    /// the per-file token-budget allocation.
220    pub symbols: Vec<RepoMapSymbol>,
221    /// Number of symbols that were omitted due to budget exhaustion or
222    /// logarithmic attenuation cutoff. `truncated_symbols + symbols.len()`
223    /// equals the total definition count for the file.
224    pub truncated_symbols: usize,
225    /// Number of call-edges that were omitted due to budget exhaustion
226    /// or logarithmic attenuation cutoff. `truncated_calls + calls.len()`
227    /// equals the total callee count for the file.
228    pub truncated_calls: usize,
229}
230
231/// JSON-mode response envelope for `get_repo_map` (4.0.1 shape).
232///
233/// Replaces the `max_files`-capped shape from 4.0.0. The caller supplies a
234/// `token_budget`; files are allocated bytes proportional to their `PageRank`
235/// (40% cap per file, 200-byte envelope floor). Symbols are filled in
236/// def-rank order with a logarithmic attenuation cutoff. Leftover bytes
237/// cascade to subsequent files.
238///
239/// The `estimated_bytes`, `budget_bytes`, and `budget_exhausted` fields give
240/// callers real-time feedback on how tightly the budget was consumed.
241#[derive(Debug, Clone, Serialize)]
242pub struct GetRepoMapResponse {
243    /// Files sorted by `PageRank` descending, pruned to the token budget.
244    pub files: Vec<RepoMapFile>,
245    /// Total number of eligible files in the graph (pre-allocation).
246    ///
247    /// If `total_files > files.len()`, the budget ran out before all files
248    /// could be included. Read `budget_exhausted` directly for the boolean.
249    pub total_files: usize,
250    /// Actual serialised-JSON byte count for all returned content.
251    pub estimated_bytes: usize,
252    /// Budget ceiling in bytes that was used for allocation
253    /// (`token_budget * 4`).
254    pub budget_bytes: usize,
255    /// `true` when `total_files > files.len()` (budget was exhausted before
256    /// all eligible files were included).
257    pub budget_exhausted: bool,
258    /// Retained for backward compatibility with 4.0.0 callers that checked
259    /// `capped`. Equivalent to `budget_exhausted`.
260    pub capped: bool,
261}
262
263// ── Constants ────────────────────────────────────────────────────────
264
265/// `PageRank` damping factor.
266const DAMPING: f32 = 0.85;
267
268/// `PageRank` convergence threshold.
269const EPSILON: f32 = 1e-6;
270
271/// Maximum `PageRank` iterations.
272const MAX_ITERATIONS: usize = 100;
273
274/// Maximum callers/callees stored per file.
275const MAX_NEIGHBORS: usize = 5;
276
277/// Approximate characters per token for budget estimation.
278const CHARS_PER_TOKEN: usize = 4;
279
280/// Concentration mass placed on the focus node in topic-sensitive `PageRank`.
281///
282/// Following Haveliwala 2002 ("Topic-Sensitive PageRank"), the personalization
283/// vector places a small bias `α` on the focus node and distributes the
284/// remaining `1 - α` uniformly over all other nodes. This preserves rank
285/// dispersion across the corpus — the user sees a *neighborhood* of related
286/// files rebiased toward the focus, not a Dirac delta on the focus node with
287/// every other file collapsed to an equally negligible uniform floor.
288///
289/// Value 0.15 means:
290///   - focus node teleportation probability = 0.15
291///   - each of the (n - 1) other nodes = 0.85 / (n - 1)
292///
293/// Prior to 4.0.5, this constant was effectively 0.70 (70% bias on focus),
294/// which caused winner-take-all collapse observed on the flask and ripvec
295/// corpora (I#16).
296const PERSONALIZATION_ALPHA: f32 = 0.15;
297
298// ── Import Queries ───────────────────────────────────────────────────
299
300/// Compile a tree-sitter import query for the given extension.
301///
302/// Returns `None` for unsupported extensions.
303fn import_query_for_extension(ext: &str) -> Option<(tree_sitter::Language, Query)> {
304    let (lang, query_str): (tree_sitter::Language, &str) = match ext {
305        "rs" => (
306            tree_sitter_rust::LANGUAGE.into(),
307            "(use_declaration) @import",
308        ),
309        "py" | "pyi" => (
310            tree_sitter_python::LANGUAGE.into(),
311            concat!(
312                "(import_statement) @import\n",
313                "(import_from_statement) @import",
314            ),
315        ),
316        "js" | "jsx" => (
317            tree_sitter_javascript::LANGUAGE.into(),
318            "(import_statement source: (string) @import_path) @import",
319        ),
320        "ts" => (
321            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
322            "(import_statement source: (string) @import_path) @import",
323        ),
324        "tsx" => (
325            tree_sitter_typescript::LANGUAGE_TSX.into(),
326            "(import_statement source: (string) @import_path) @import",
327        ),
328        "go" => (
329            tree_sitter_go::LANGUAGE.into(),
330            "(import_spec path: (interpreted_string_literal) @import_path) @import",
331        ),
332        // Ruby: require statements.
333        "rb" => (
334            tree_sitter_ruby::LANGUAGE.into(),
335            "(call method: (identifier) @_method arguments: (argument_list (string (string_content) @import_path)) (#eq? @_method \"require\")) @import",
336        ),
337        _ => return None,
338    };
339    let query = match Query::new(&lang, query_str) {
340        Ok(q) => q,
341        Err(e) => {
342            tracing::warn!(ext, %e, "import query compilation failed — language may be ABI-incompatible");
343            return None;
344        }
345    };
346    Some((lang, query))
347}
348
349/// Extract import paths from source using tree-sitter.
350fn extract_imports(
351    source: &str,
352    lang: &tree_sitter::Language,
353    import_query: &Query,
354) -> Vec<String> {
355    let mut parser = Parser::new();
356    if parser.set_language(lang).is_err() {
357        return vec![];
358    }
359    let Some(tree) = parser.parse(source, None) else {
360        return vec![];
361    };
362
363    let mut cursor = QueryCursor::new();
364    let mut imports = Vec::new();
365    let mut matches = cursor.matches(import_query, tree.root_node(), source.as_bytes());
366
367    while let Some(m) = matches.next() {
368        // Prefer @import_path capture (JS/TS/Go), fall back to full @import text
369        let mut import_path_text = None;
370        let mut import_text = None;
371
372        for cap in m.captures {
373            let cap_name = &import_query.capture_names()[cap.index as usize];
374            let text = &source[cap.node.start_byte()..cap.node.end_byte()];
375            if *cap_name == "import_path" {
376                import_path_text = Some(text.trim_matches(|c| c == '"' || c == '\''));
377            } else if *cap_name == "import" {
378                import_text = Some(text);
379            }
380        }
381
382        if let Some(path) = import_path_text {
383            imports.push(path.to_string());
384        } else if let Some(text) = import_text {
385            imports.push(text.to_string());
386        }
387    }
388
389    imports
390}
391
392// ── Import Resolution ────────────────────────────────────────────────
393
394/// Resolve a Rust `use` path to a file index in the file map.
395///
396/// Handles `crate::`, `self::`, and `super::` prefixes. External crate
397/// imports are dropped (returns `None`).
398fn resolve_rust_import(
399    raw: &str,
400    file_path: &Path,
401    root: &Path,
402    file_index: &HashMap<PathBuf, usize>,
403) -> Option<usize> {
404    // Extract the module path from `use crate::foo::bar;` or `use crate::foo::bar::Baz;`
405    let trimmed = raw
406        .trim()
407        .trim_start_matches("use ")
408        .trim_end_matches(';')
409        .trim();
410
411    let segments: Vec<&str> = trimmed.split("::").collect();
412    if segments.is_empty() {
413        return None;
414    }
415
416    // Determine the base directory and skip prefix segments
417    let (base, skip) = match segments[0] {
418        "crate" => {
419            // Find the nearest Cargo.toml ancestor to determine the crate root.
420            // In a workspace, `crate::foo` resolves relative to the crate's src/,
421            // not the workspace root.
422            let mut dir = file_path.parent();
423            let crate_root = loop {
424                match dir {
425                    Some(d) if d.join("Cargo.toml").exists() => break d.join("src"),
426                    Some(d) => dir = d.parent(),
427                    None => break root.join("src"), // fallback
428                }
429            };
430            (crate_root, 1)
431        }
432        "self" => {
433            let dir = file_path.parent()?;
434            (dir.to_path_buf(), 1)
435        }
436        "super" => {
437            let dir = file_path.parent()?.parent()?;
438            (dir.to_path_buf(), 1)
439        }
440        // External crate — drop
441        _ => return None,
442    };
443
444    // Build candidate paths from the remaining segments.
445    // Try progressively shorter prefixes since the last segments
446    // may be items (struct, fn) rather than modules.
447    let path_segments = &segments[skip..];
448    for end in (1..=path_segments.len()).rev() {
449        let mut candidate = base.clone();
450        for seg in &path_segments[..end] {
451            // Strip glob patterns like `{Foo, Bar}`
452            let clean = seg.split('{').next().unwrap_or(seg).trim();
453            if !clean.is_empty() {
454                candidate.push(clean);
455            }
456        }
457
458        // Try file.rs
459        let as_file = candidate.with_extension("rs");
460        if let Some(&idx) = file_index.get(&as_file) {
461            return Some(idx);
462        }
463
464        // Try dir/mod.rs
465        let as_mod = candidate.join("mod.rs");
466        if let Some(&idx) = file_index.get(&as_mod) {
467            return Some(idx);
468        }
469    }
470
471    None
472}
473
474/// Resolve an import path to a file index based on file extension.
475fn resolve_import(
476    raw: &str,
477    ext: &str,
478    file_path: &Path,
479    root: &Path,
480    file_index: &HashMap<PathBuf, usize>,
481) -> Option<usize> {
482    match ext {
483        "rs" => resolve_rust_import(raw, file_path, root, file_index),
484        "py" | "pyi" => resolve_python_import(raw, root, file_index),
485        "js" | "jsx" | "ts" | "tsx" => resolve_js_import(raw, file_path, file_index),
486        // Go imports use full package paths — skip local resolution
487        _ => None,
488    }
489}
490
491/// Resolve a Python import to a file index.
492///
493/// Handles `import foo.bar` and `from foo.bar import baz` patterns.
494fn resolve_python_import(
495    raw: &str,
496    root: &Path,
497    file_index: &HashMap<PathBuf, usize>,
498) -> Option<usize> {
499    let module_path = if let Some(rest) = raw.strip_prefix("from ") {
500        rest.split_whitespace().next()?
501    } else if let Some(rest) = raw.strip_prefix("import ") {
502        rest.split_whitespace().next()?
503    } else {
504        return None;
505    };
506
507    let rel_path: PathBuf = module_path.split('.').collect();
508    for ext in ["py", "pyi"] {
509        let as_file = root.join(&rel_path).with_extension(ext);
510        if let Some(&idx) = file_index.get(&as_file) {
511            return Some(idx);
512        }
513    }
514
515    for init_name in ["__init__.py", "__init__.pyi"] {
516        let as_init = root.join(&rel_path).join(init_name);
517        if let Some(&idx) = file_index.get(&as_init) {
518            return Some(idx);
519        }
520    }
521
522    None
523}
524
525/// Resolve a JS/TS import to a file index.
526///
527/// Handles relative paths like `./foo` or `../bar`.
528fn resolve_js_import(
529    raw: &str,
530    file_path: &Path,
531    file_index: &HashMap<PathBuf, usize>,
532) -> Option<usize> {
533    if !raw.starts_with('.') {
534        return None;
535    }
536
537    let dir = file_path.parent()?;
538    let candidate = dir.join(raw);
539
540    for ext in &["js", "jsx", "ts", "tsx"] {
541        let with_ext = candidate.with_extension(ext);
542        if let Some(&idx) = file_index.get(&with_ext) {
543            return Some(idx);
544        }
545    }
546
547    for ext in &["js", "jsx", "ts", "tsx"] {
548        let index_file = candidate.join("index").with_extension(ext);
549        if let Some(&idx) = file_index.get(&index_file) {
550            return Some(idx);
551        }
552    }
553
554    None
555}
556
557// ── Extraction ───────────────────────────────────────────────────────
558
559/// Extract definitions from a source file using tree-sitter.
560fn extract_definitions(source: &str, config: &languages::LangConfig) -> Vec<Definition> {
561    let mut parser = Parser::new();
562    if parser.set_language(&config.language).is_err() {
563        return vec![];
564    }
565    let Some(tree) = parser.parse(source, None) else {
566        return vec![];
567    };
568
569    let mut cursor = QueryCursor::new();
570    let mut defs = Vec::new();
571    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
572
573    while let Some(m) = matches.next() {
574        let mut name = String::new();
575        let mut def_node = None;
576
577        for cap in m.captures {
578            let cap_name = &config.query.capture_names()[cap.index as usize];
579            if *cap_name == "name" {
580                name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
581            } else if *cap_name == "def" {
582                def_node = Some(cap.node);
583            }
584        }
585
586        if let Some(node) = def_node {
587            let scope = crate::chunk::build_scope_chain(node, source);
588            let signature = crate::chunk::extract_signature(node, source);
589            #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
590            let start_line = node.start_position().row as u32 + 1;
591            #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
592            let end_line = node.end_position().row as u32 + 1;
593            #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
594            let start_byte = node.start_byte() as u32;
595            #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
596            let end_byte = node.end_byte() as u32;
597            defs.push(Definition {
598                name,
599                kind: node.kind().to_string(),
600                start_line,
601                end_line,
602                scope,
603                signature,
604                start_byte,
605                end_byte,
606                calls: vec![],
607            });
608        }
609    }
610
611    defs
612}
613
614// ── Call Extraction & Resolution ────────────────────────────────────
615
616/// Tiebreak priority for def attribution when two defs share the same byte span.
617///
618/// Returns `0` for function-like defs (lowest value = wins in `min_by_key`) and
619/// `1` for structural container defs (class bodies, impl blocks, etc.).
620///
621/// This resolves the Python case where the class body `block` and the first
622/// `function_definition` inside it occupy identical byte ranges; calls inside
623/// the function body should be attributed to the function, not the class block.
624fn is_callable_def_priority(kind: &str) -> u8 {
625    match kind {
626        // Function / method defs: these are the correct attribution targets.
627        "function_item"
628        | "function_definition"
629        | "function_declaration"
630        | "function_signature_item"
631        | "method_definition"
632        | "method_declaration"
633        | "method" => 0,
634        // Structural containers: class body blocks, impl items, etc.
635        // Prefer function-like defs over these when byte ranges tie.
636        _ => 1,
637    }
638}
639
640/// Extract call sites from a source file and assign them to definitions.
641///
642/// Uses the language's call query to find all call expressions, then
643/// assigns each call to the definition whose byte range contains it.
644/// Calls outside any definition body (module-level) are ignored.
645///
646/// For Rust scoped calls (`a::b::foo()`), the `@callee` capture returns the
647/// full `scoped_identifier` node. This function splits it into:
648/// - `name` = bare trailing identifier (`"foo"`)
649/// - `qualified_path` = `Some("a::b::foo")` for disambiguation in `resolve_calls`.
650///
651/// For method calls (`x.method()`), `receiver_type` is inferred from local
652/// context (parameter annotations, let-bindings, impl blocks). See
653/// [`infer_receiver_types`] for the heuristic.
654fn extract_calls(source: &str, call_config: &languages::CallConfig, defs: &mut [Definition]) {
655    let mut parser = Parser::new();
656    if parser.set_language(&call_config.language).is_err() {
657        return;
658    }
659    let Some(tree) = parser.parse(source, None) else {
660        return;
661    };
662
663    // Build receiver-type map: byte_offset_of_call → receiver_type_string.
664    // Done once per file to amortise the tree walk cost.
665    let receiver_map = infer_receiver_types(source, &tree, &call_config.language);
666
667    // HCL: run the HCL-specific call-edge extractor as a post-pass so the
668    // terraform_remote_state references and module blocks contribute edges
669    // that the generic function_call query cannot capture (R2 + R3, Wave 3).
670    if languages::is_hcl_language(&call_config.language) {
671        extract_hcl_call_edges(source, tree.root_node(), defs);
672    }
673
674    let mut cursor = QueryCursor::new();
675    let mut matches = cursor.matches(&call_config.query, tree.root_node(), source.as_bytes());
676
677    while let Some(m) = matches.next() {
678        let mut full_callee_text = None;
679        let mut call_byte = 0u32;
680
681        for cap in m.captures {
682            let cap_name = &call_config.query.capture_names()[cap.index as usize];
683            if *cap_name == "callee" {
684                full_callee_text =
685                    Some(source[cap.node.start_byte()..cap.node.end_byte()].to_string());
686                #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
687                {
688                    call_byte = cap.node.start_byte() as u32;
689                }
690            }
691        }
692
693        if let Some(full_text) = full_callee_text {
694            // Split qualified path into bare name + optional qualifier.
695            let (name, qualified_path) = if full_text.contains("::") {
696                let bare = full_text
697                    .rsplit("::")
698                    .next()
699                    .unwrap_or(&full_text)
700                    .to_string();
701                (bare, Some(full_text))
702            } else {
703                (full_text, None)
704            };
705
706            // Look up receiver type from the pre-built map.
707            let receiver_type = receiver_map.get(&call_byte).cloned();
708
709            // Assign to the most-specific (smallest byte range) enclosing definition.
710            // Using `find` (first match) was incorrect for nested defs: an `impl_item`
711            // wrapping a `function_item` both contain the call site, but the
712            // `function_item` is the correct granularity for method attribution.
713            //
714            // Tiebreak: when two defs have equal byte spans (as happens in Python where
715            // the class body `block` and its first `function_definition` share the same
716            // start/end bytes), prefer function-like defs over structural container defs.
717            // `is_callable_def` returns 0 for function-like kinds (sorts first in min_by_key).
718            let enclosing_idx = defs
719                .iter()
720                .enumerate()
721                .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
722                .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
723                .map(|(i, _)| i);
724
725            if let Some(idx) = enclosing_idx {
726                // Skip self-recursive calls (compare bare name to def name).
727                if defs[idx].name != name {
728                    defs[idx].calls.push(CallRef {
729                        name,
730                        qualified_path,
731                        receiver_type,
732                        byte_offset: call_byte,
733                        resolved: None,
734                    });
735                }
736            }
737            // Calls outside any definition are ignored (module-level init).
738        }
739    }
740}
741
742// HCL: post-pass call-edge extraction for terraform_remote_state and module
743// blocks. These are not function calls — they are HCL-specific structural
744// references to other Terraform modules — so the generic
745// `(function_call (identifier) @callee) @call` pattern in
746// `call_query_for_extension("tf")` cannot capture them. This helper runs
747// once per HCL file inside `extract_calls` (R2 + R3, Wave 3).
748
749/// Walk an HCL parse tree and emit CallRef entries for:
750///
751/// 1. `data.terraform_remote_state.<NAME>.outputs.<ATTR>` expressions:
752///    one CallRef per reference, with `name = NAME` and
753///    `qualified_path = Some("terraform_remote_state.NAME")`. These
754///    connect the current file to the named remote-state module's outputs.
755///
756/// 2. `module "X" { source = "../X" }` blocks: one CallRef with
757///    `name = X` (the label) and `qualified_path = Some("module.X")`.
758///    The module reference connects to the module's directory in
759///    `resolve_import` (HCL module-source resolution is not implemented
760///    yet — the qualified_path carrier is the contract; resolve adds the
761///    file lookup).
762///
763/// Each emitted CallRef is attached to the smallest enclosing definition
764/// by byte range — matching the same heuristic used by `extract_calls`.
765fn extract_hcl_call_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
766    // Walk all named descendants iteratively.
767    let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
768    while let Some(node) = stack.pop() {
769        // Defer to a function-style helper per node kind.
770        match node.kind() {
771            "expression" => hcl_visit_expression(source, node, defs),
772            "block" => hcl_visit_block(source, node, defs),
773            _ => {}
774        }
775        // Recurse into named children.
776        let mut cursor = node.walk();
777        for child in node.children(&mut cursor) {
778            if child.is_named() {
779                stack.push(child);
780            }
781        }
782    }
783}
784
785/// Inspect an HCL `expression` node for the
786/// `data.terraform_remote_state.<NAME>.outputs.<ATTR>` reference pattern.
787///
788/// The expression tree looks like:
789/// ```text
790/// expression
791///   variable_expr
792///     identifier "data"
793///   get_attr
794///     identifier "terraform_remote_state"
795///   get_attr
796///     identifier "<NAME>"
797///   get_attr
798///     identifier "outputs"
799///   get_attr
800///     identifier "<ATTR>"
801/// ```
802fn hcl_visit_expression(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
803    // Collect children: must be `variable_expr` (with identifier="data")
804    // followed by a chain of `get_attr` nodes (each with an `identifier` child).
805    let mut cursor = node.walk();
806    let mut child_iter = node.children(&mut cursor);
807    let Some(first) = child_iter.next() else {
808        return;
809    };
810    if first.kind() != "variable_expr" {
811        return;
812    }
813    let Some(first_id) = first.child_by_field_name("name").or_else(|| {
814        // Fallback: find first named child that's an identifier.
815        let mut c = first.walk();
816        first.children(&mut c).find(|n| n.kind() == "identifier")
817    }) else {
818        return;
819    };
820    if &source[first_id.start_byte()..first_id.end_byte()] != "data" {
821        return;
822    }
823
824    // Collect identifiers from the chain of get_attr.
825    let mut chain: Vec<String> = Vec::new();
826    for child in child_iter {
827        if child.kind() != "get_attr" {
828            return; // not a pure attribute chain
829        }
830        let mut gc = child.walk();
831        let id = child.children(&mut gc).find(|n| n.kind() == "identifier");
832        let Some(id_node) = id else { return };
833        chain.push(source[id_node.start_byte()..id_node.end_byte()].to_string());
834    }
835
836    // Expect: terraform_remote_state, <NAME>, outputs, <ATTR>
837    if chain.len() < 2 || chain[0] != "terraform_remote_state" {
838        return;
839    }
840    let name = chain[1].clone();
841    let qualified_path = format!("terraform_remote_state.{name}");
842
843    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
844    let call_byte = node.start_byte() as u32;
845    attach_hcl_call(defs, call_byte, name, Some(qualified_path));
846}
847
848/// Inspect an HCL `block` node for the `module "X" { source = "../X" }`
849/// pattern. Emits one CallRef per matching block.
850fn hcl_visit_block(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
851    // First child must be identifier="module".
852    let mut cursor = node.walk();
853    let children: Vec<tree_sitter::Node<'_>> = node.children(&mut cursor).collect();
854    let Some(first) = children.first() else {
855        return;
856    };
857    if first.kind() != "identifier" || &source[first.start_byte()..first.end_byte()] != "module" {
858        return;
859    }
860    // Next child should be a string_lit (the module label).
861    let label_node = children.iter().find(|c| c.kind() == "string_lit");
862    let Some(label_node) = label_node else {
863        return;
864    };
865    let mut lc = label_node.walk();
866    let template = label_node
867        .children(&mut lc)
868        .find(|n| n.kind() == "template_literal");
869    let Some(template) = template else {
870        return;
871    };
872    let label = source[template.start_byte()..template.end_byte()].to_string();
873    let qualified_path = format!("module.{label}");
874
875    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
876    let call_byte = node.start_byte() as u32;
877    attach_hcl_call(defs, call_byte, label, Some(qualified_path));
878}
879
880/// Attach a synthesized HCL CallRef to the smallest enclosing definition.
881/// Mirrors the byte-range attribution from `extract_calls`.
882fn attach_hcl_call(
883    defs: &mut [Definition],
884    call_byte: u32,
885    name: String,
886    qualified_path: Option<String>,
887) {
888    let enclosing_idx = defs
889        .iter()
890        .enumerate()
891        .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
892        .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
893        .map(|(i, _)| i);
894    if let Some(idx) = enclosing_idx {
895        // Skip self-recursive emission (would happen if the enclosing def
896        // happens to share the same `name` as the synthesized callee).
897        if defs[idx].name != name {
898            defs[idx].calls.push(CallRef {
899                name,
900                qualified_path,
901                receiver_type: None,
902                byte_offset: call_byte,
903                resolved: None,
904            });
905        }
906    }
907}
908
909/// Infer method-call receiver types from local context within a parse tree.
910///
911/// Returns a map from `byte_offset_of_@callee_capture` to a receiver type string.
912///
913/// Dispatches to a language-specific collector:
914///
915/// - **Rust**: [`collect_rust_receiver_types`] — three heuristic cases:
916///   1. `self.method()` inside `impl Foo { … }` → `"Foo"`.
917///   2. `x.method()` where `x: Bar` is a function parameter → `"Bar"`.
918///   3. `x.method()` after `let x = Foo::new()` → `"Foo"`.
919///
920/// - **Python**: [`collect_python_receiver_types`] — two heuristic cases:
921///   1. `self.method()` inside a class method → class name from enclosing
922///      `class_definition`.
923///   2. `instance.method()` where `instance: ClassName` type annotation or
924///      `instance = ClassName(...)` assignment is visible in the same scope.
925///
926/// - **Go**: [`collect_go_receiver_types`] — one heuristic case:
927///   1. `recv.Method()` inside a `method_declaration` where `recv` is the
928///      named receiver parameter → receiver type from the method signature.
929///
930/// This is heuristic, not type-inference-complete. Unknown/ambiguous cases
931/// produce no entry in the map; `extract_calls` leaves those `receiver_type = None`.
932fn infer_receiver_types(
933    source: &str,
934    tree: &tree_sitter::Tree,
935    language: &tree_sitter::Language,
936) -> HashMap<u32, String> {
937    let mut map: HashMap<u32, String> = HashMap::new();
938
939    if languages::is_rust_language(language) {
940        collect_rust_receiver_types(source, tree.root_node(), &mut map);
941    } else if languages::is_python_language(language) {
942        collect_python_receiver_types(source, tree.root_node(), &mut map);
943    } else if languages::is_go_language(language) {
944        collect_go_receiver_types(source, tree.root_node(), &mut map);
945    }
946    // Other languages: no receiver inference — leave map empty.
947
948    map
949}
950
951/// Walk the Rust parse tree and fill `map` with receiver-type inference.
952///
953/// This is a recursive descent that tracks:
954/// - The current `impl Foo` or `impl Foo for Bar` type name (for `self.*` calls).
955/// - Parameter type annotations (for `x: SomeType` → `x` has type `SomeType`).
956/// - Constructor let-bindings (`let x = Foo::new()` → `x` has type `Foo`).
957fn collect_rust_receiver_types(
958    source: &str,
959    node: tree_sitter::Node<'_>,
960    map: &mut HashMap<u32, String>,
961) {
962    // We use a stack-based walk to avoid deep recursion on large files.
963    // Each stack entry carries (node, impl_type_context).
964    let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(node, None)];
965
966    while let Some((n, impl_ctx)) = stack.pop() {
967        match n.kind() {
968            "impl_item" => {
969                // Extract `impl Foo` or `impl Trait for Foo` → capture the `for` type.
970                // tree-sitter-rust shape: `(impl_item type: (type_identifier) @type)` for
971                // inherent impls, and `(impl_item trait: … type: (type_identifier) @type)`
972                // for trait impls. Both have a child named `type`.
973                let impl_type = extract_impl_self_type(source, n);
974                let new_ctx = impl_type.or_else(|| impl_ctx.clone());
975                let mut cursor = n.walk();
976                for child in n.children(&mut cursor) {
977                    stack.push((child, new_ctx.clone()));
978                }
979            }
980            "function_item" => {
981                // Build parameter bindings: (param_name → type_name).
982                let param_types = extract_param_types(source, n);
983                // Build let-binding type map from constructor calls.
984                let let_types = extract_let_binding_types(source, n);
985                // Annotate call sites within this function body.
986                annotate_method_calls(
987                    source,
988                    n,
989                    impl_ctx.as_deref(),
990                    &param_types,
991                    &let_types,
992                    map,
993                );
994                // Do NOT recurse into function_item children with the outer stack —
995                // function bodies are fully handled by annotate_method_calls.
996                // (Nested fn items would re-enter via their own impl_item context.)
997                // Push children with same impl_ctx so nested impl blocks are found.
998                let mut cursor = n.walk();
999                for child in n.children(&mut cursor) {
1000                    stack.push((child, impl_ctx.clone()));
1001                }
1002            }
1003            _ => {
1004                let mut cursor = n.walk();
1005                for child in n.children(&mut cursor) {
1006                    stack.push((child, impl_ctx.clone()));
1007                }
1008            }
1009        }
1010    }
1011}
1012
1013/// Extract the self type from an `impl_item` node.
1014///
1015/// For `impl Foo { … }` → `Some("Foo")`.
1016/// For `impl Trait for Foo { … }` → `Some("Foo")` (the concrete `for` type).
1017fn extract_impl_self_type(source: &str, impl_node: tree_sitter::Node<'_>) -> Option<String> {
1018    // tree-sitter-rust: impl_item has a field named "type" for the self type.
1019    // For `impl Foo for Bar { }`, "type" is Bar; for `impl Foo { }`, "type" is Foo.
1020    let type_node = impl_node.child_by_field_name("type")?;
1021    Some(source[type_node.start_byte()..type_node.end_byte()].to_string())
1022}
1023
1024/// Extract parameter name → type mappings from a function signature.
1025///
1026/// Handles `fn foo(x: Bar, y: Baz)` → `{"x": "Bar", "y": "Baz"}`.
1027/// The `self`/`&self`/`&mut self` parameter is skipped (handled via impl_ctx).
1028fn extract_param_types(source: &str, fn_node: tree_sitter::Node<'_>) -> HashMap<String, String> {
1029    let mut params: HashMap<String, String> = HashMap::new();
1030    let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1031        return params;
1032    };
1033    let mut cursor = params_node.walk();
1034    for param in params_node.children(&mut cursor) {
1035        if param.kind() == "parameter" {
1036            // parameter has children: pattern (identifier) and type
1037            let mut param_name = None;
1038            let mut param_type = None;
1039            let mut pc = param.walk();
1040            for child in param.children(&mut pc) {
1041                match child.kind() {
1042                    "identifier" | "mutable_specifier" if param_name.is_none() => {
1043                        let text = source[child.start_byte()..child.end_byte()].to_string();
1044                        if text != "mut" {
1045                            param_name = Some(text);
1046                        }
1047                    }
1048                    "type_identifier"
1049                    | "generic_type"
1050                    | "reference_type"
1051                    | "scoped_type_identifier"
1052                        if param_type.is_none() =>
1053                    {
1054                        // Extract the base type identifier from potentially complex types.
1055                        param_type = Some(extract_base_type(source, child));
1056                    }
1057                    _ => {}
1058                }
1059            }
1060            if let (Some(name), Some(ty)) = (param_name, param_type)
1061                && !ty.is_empty()
1062            {
1063                params.insert(name, ty);
1064            }
1065        }
1066        // Also handle typed_pattern in newer grammars
1067        if param.kind() == "typed_pattern" {
1068            let mut name_part = None;
1069            let mut type_part = None;
1070            let mut pc = param.walk();
1071            for child in param.children(&mut pc) {
1072                if child.kind() == "identifier" && name_part.is_none() {
1073                    name_part = Some(source[child.start_byte()..child.end_byte()].to_string());
1074                } else if matches!(
1075                    child.kind(),
1076                    "type_identifier"
1077                        | "generic_type"
1078                        | "reference_type"
1079                        | "scoped_type_identifier"
1080                ) && type_part.is_none()
1081                {
1082                    type_part = Some(extract_base_type(source, child));
1083                }
1084            }
1085            if let (Some(name), Some(ty)) = (name_part, type_part)
1086                && !ty.is_empty()
1087            {
1088                params.insert(name, ty);
1089            }
1090        }
1091    }
1092    params
1093}
1094
1095/// Extract the base `TypeIdentifier` from a potentially complex type node.
1096///
1097/// For `Bar`, `&Bar`, `&mut Bar`, `Bar<T>` → returns `"Bar"`.
1098/// For `module::Bar` → returns `"Bar"` (bare name for matching).
1099fn extract_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1100    match node.kind() {
1101        "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1102        "generic_type" | "reference_type" | "mutable_specifier" | "scoped_type_identifier" => {
1103            // Recurse to find the innermost type_identifier
1104            let mut cursor = node.walk();
1105            for child in node.children(&mut cursor) {
1106                let t = extract_base_type(source, child);
1107                if !t.is_empty() {
1108                    return t;
1109                }
1110            }
1111            String::new()
1112        }
1113        _ => {
1114            // For other nodes, try children
1115            let mut cursor = node.walk();
1116            for child in node.children(&mut cursor) {
1117                if child.kind() == "type_identifier" {
1118                    return source[child.start_byte()..child.end_byte()].to_string();
1119                }
1120            }
1121            String::new()
1122        }
1123    }
1124}
1125
1126/// Scan a function body for `let x = Foo::new()` patterns.
1127///
1128/// Returns a map from local variable name to the constructor type name.
1129/// E.g., `let x = Foo::new();` → `{"x": "Foo"}`.
1130fn extract_let_binding_types(
1131    source: &str,
1132    fn_node: tree_sitter::Node<'_>,
1133) -> HashMap<String, String> {
1134    let mut bindings: HashMap<String, String> = HashMap::new();
1135
1136    let Some(body) = fn_node.child_by_field_name("body") else {
1137        return bindings;
1138    };
1139
1140    // Walk the function body looking for let_declaration nodes.
1141    let mut stack = vec![body];
1142    while let Some(n) = stack.pop() {
1143        if n.kind() == "let_declaration" {
1144            // let_declaration: pattern (identifier) + value (call_expression or …)
1145            let mut binding_name = None;
1146            let mut constructor_type = None;
1147            let mut cursor = n.walk();
1148            for child in n.children(&mut cursor) {
1149                match child.kind() {
1150                    "identifier" if binding_name.is_none() => {
1151                        binding_name =
1152                            Some(source[child.start_byte()..child.end_byte()].to_string());
1153                    }
1154                    "call_expression" => {
1155                        // Look for `Foo::new()` or `Foo::from(…)` patterns.
1156                        // The function child of call_expression is a scoped_identifier.
1157                        if let Some(func) = child.child_by_field_name("function")
1158                            && func.kind() == "scoped_identifier"
1159                        {
1160                            // scoped_identifier path: `Foo::new` — extract head segment.
1161                            let full = source[func.start_byte()..func.end_byte()].to_string();
1162                            let head = full.split("::").next().unwrap_or("").to_string();
1163                            if !head.is_empty()
1164                                && head.chars().next().is_some_and(char::is_uppercase)
1165                            {
1166                                constructor_type = Some(head);
1167                            }
1168                        }
1169                    }
1170                    _ => {}
1171                }
1172            }
1173            if let (Some(name), Some(ty)) = (binding_name, constructor_type) {
1174                bindings.insert(name, ty);
1175            }
1176        }
1177        // Push children for recursive walk.
1178        let mut cursor = n.walk();
1179        for child in n.children(&mut cursor) {
1180            stack.push(child);
1181        }
1182    }
1183
1184    bindings
1185}
1186
1187/// Walk a function body and annotate method-call byte offsets with receiver types.
1188///
1189/// A "method call" in the ripvec call query is:
1190/// `(call_expression function: (field_expression field: (field_identifier) @callee))`
1191///
1192/// The receiver is the `value` child of `field_expression`. This function
1193/// checks whether the receiver is:
1194/// - `self` → use `impl_ctx` type.
1195/// - An identifier matching a parameter type in `param_types`.
1196/// - An identifier matching a constructor let-binding in `let_types`.
1197fn annotate_method_calls(
1198    source: &str,
1199    fn_node: tree_sitter::Node<'_>,
1200    impl_ctx: Option<&str>,
1201    param_types: &HashMap<String, String>,
1202    let_types: &HashMap<String, String>,
1203    map: &mut HashMap<u32, String>,
1204) {
1205    // Walk the entire function (including its body) looking for call_expression nodes.
1206    let mut stack = vec![fn_node];
1207    while let Some(n) = stack.pop() {
1208        if n.kind() == "call_expression"
1209            && let Some(func) = n.child_by_field_name("function")
1210            && func.kind() == "field_expression"
1211        {
1212            // field_expression: value (receiver) + field (method name identifier)
1213            if let (Some(recv), Some(field)) = (
1214                func.child_by_field_name("value"),
1215                func.child_by_field_name("field"),
1216            ) {
1217                let recv_text = source[recv.start_byte()..recv.end_byte()].to_string();
1218                let receiver_type = if recv_text == "self" || recv_text == "*self" {
1219                    impl_ctx.map(str::to_owned)
1220                } else {
1221                    // Strip ref sigils for lookup.
1222                    let base = recv_text
1223                        .trim_start_matches('*')
1224                        .trim_start_matches('&')
1225                        .trim();
1226                    param_types
1227                        .get(base)
1228                        .or_else(|| let_types.get(base))
1229                        .cloned()
1230                };
1231
1232                if let Some(ty) = receiver_type {
1233                    // The `@callee` capture byte offset is the start of the field node.
1234                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1235                    let field_byte = field.start_byte() as u32;
1236                    map.insert(field_byte, ty);
1237                }
1238            }
1239        }
1240        let mut cursor = n.walk();
1241        for child in n.children(&mut cursor) {
1242            stack.push(child);
1243        }
1244    }
1245}
1246
1247// ── Python receiver-type heuristic ───────────────────────────────────
1248
1249/// Walk the Python parse tree and fill `map` with receiver-type inference.
1250///
1251/// Two heuristic cases:
1252///
1253/// 1. **`self.method()` inside a method** — when the `attribute` call receiver is
1254///    the literal text `self`, the receiver type is the name of the nearest
1255///    enclosing `class_definition`.
1256///
1257/// 2. **`instance.method()` with a type annotation or constructor call** —
1258///    when a function parameter has a PEP 484 annotation `param: ClassName` or
1259///    when a local assignment `param = ClassName(...)` precedes the call, the
1260///    receiver type is bound to `ClassName`.
1261///
1262/// The Python call query captures:
1263/// - `(call function: (attribute attribute: (identifier) @callee)) @call`
1264///
1265/// Within the `attribute` node, `value` is the receiver expression and
1266/// `attribute` is the method name (the `@callee` capture). The `@callee`
1267/// byte offset is the start of the `attribute` child identifier node.
1268fn collect_python_receiver_types(
1269    source: &str,
1270    root: tree_sitter::Node<'_>,
1271    map: &mut HashMap<u32, String>,
1272) {
1273    // Stack carries (node, class_ctx: Option<String>).
1274    // class_ctx is the name of the nearest enclosing class_definition.
1275    let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(root, None)];
1276
1277    while let Some((n, class_ctx)) = stack.pop() {
1278        match n.kind() {
1279            "class_definition" => {
1280                // Extract the class name from the `name` child.
1281                let class_name = n
1282                    .child_by_field_name("name")
1283                    .map(|c| source[c.start_byte()..c.end_byte()].to_string());
1284                let new_ctx = class_name.or_else(|| class_ctx.clone());
1285                let mut cursor = n.walk();
1286                for child in n.children(&mut cursor) {
1287                    stack.push((child, new_ctx.clone()));
1288                }
1289            }
1290            "function_definition" => {
1291                // Build parameter annotation map: param_name → type_name.
1292                let param_types = extract_python_param_types(source, n);
1293                // Build local assignment map: var_name → constructor type.
1294                let let_types = extract_python_assignment_types(source, n);
1295                // Annotate attribute call sites within this function body.
1296                annotate_python_method_calls(
1297                    source,
1298                    n,
1299                    class_ctx.as_deref(),
1300                    &param_types,
1301                    &let_types,
1302                    map,
1303                );
1304                // Push children with same class_ctx so nested classes are found.
1305                let mut cursor = n.walk();
1306                for child in n.children(&mut cursor) {
1307                    stack.push((child, class_ctx.clone()));
1308                }
1309            }
1310            _ => {
1311                let mut cursor = n.walk();
1312                for child in n.children(&mut cursor) {
1313                    stack.push((child, class_ctx.clone()));
1314                }
1315            }
1316        }
1317    }
1318}
1319
1320/// Extract Python parameter name → type annotation mappings.
1321///
1322/// Handles PEP 484 style: `def foo(self, x: Bar, y: Baz) -> ...`.
1323/// The `self` parameter is excluded (handled via class_ctx).
1324/// Returns `{"x": "Bar", "y": "Baz"}`.
1325fn extract_python_param_types(
1326    source: &str,
1327    fn_node: tree_sitter::Node<'_>,
1328) -> HashMap<String, String> {
1329    let mut params: HashMap<String, String> = HashMap::new();
1330    let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1331        return params;
1332    };
1333
1334    // Parameters node children include `identifier`, `typed_parameter`,
1335    // `typed_default_parameter`, and others.
1336    let mut cursor = params_node.walk();
1337    for param in params_node.children(&mut cursor) {
1338        match param.kind() {
1339            "typed_parameter" => {
1340                // (typed_parameter (identifier) @name type: (type) @type)
1341                // First identifier child is the name; type child is the type.
1342                let mut name_text = None;
1343                let mut type_text = None;
1344                let mut pc = param.walk();
1345                for child in param.children(&mut pc) {
1346                    match child.kind() {
1347                        "identifier" if name_text.is_none() => {
1348                            let t = source[child.start_byte()..child.end_byte()].to_string();
1349                            if t != "self" && t != "cls" {
1350                                name_text = Some(t);
1351                            }
1352                        }
1353                        "type" | "identifier" | "attribute"
1354                            if type_text.is_none() && name_text.is_some() =>
1355                        {
1356                            // The type child in tree-sitter-python is a `type` node
1357                            // whose text is the annotation expression. Extract the
1358                            // base identifier (handle `Optional[Bar]`, `List[Bar]`, etc.)
1359                            type_text = Some(extract_python_base_type(source, child));
1360                        }
1361                        _ => {}
1362                    }
1363                }
1364                if let (Some(name), Some(ty)) = (name_text, type_text)
1365                    && !ty.is_empty()
1366                    && !ty.eq("self")
1367                    && !ty.eq("cls")
1368                {
1369                    params.insert(name, ty);
1370                }
1371            }
1372            "typed_default_parameter" => {
1373                // (typed_default_parameter name: (identifier) type: (type) value: …)
1374                let name_node = param.child_by_field_name("name");
1375                let type_node = param.child_by_field_name("type");
1376                if let (Some(nn), Some(tn)) = (name_node, type_node) {
1377                    let name = source[nn.start_byte()..nn.end_byte()].to_string();
1378                    if name != "self" && name != "cls" {
1379                        let ty = extract_python_base_type(source, tn);
1380                        if !ty.is_empty() {
1381                            params.insert(name, ty);
1382                        }
1383                    }
1384                }
1385            }
1386            _ => {}
1387        }
1388    }
1389    params
1390}
1391
1392/// Extract the base type name from a Python type annotation node.
1393///
1394/// For `Bar` → `"Bar"`. For `Optional[Bar]` or `List[Bar]` → `"Bar"`.
1395/// For `module.Class` → `"Class"` (bare name only).
1396fn extract_python_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1397    match node.kind() {
1398        "identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1399        // tree-sitter-python wraps annotations in a `type` node
1400        "type" => {
1401            let mut cursor = node.walk();
1402            for child in node.children(&mut cursor) {
1403                let t = extract_python_base_type(source, child);
1404                if !t.is_empty() {
1405                    return t;
1406                }
1407            }
1408            String::new()
1409        }
1410        // Generic alias: `Optional[Bar]` — the first identifier child is `Optional`,
1411        // the subscript child contains `Bar`. We want the subscript content.
1412        "subscript" => {
1413            // subscript has value (e.g. Optional) and subscript (e.g. Bar).
1414            // Return the subscript's base type (the inner type argument).
1415            if let Some(sub) = node.child_by_field_name("subscript") {
1416                return extract_python_base_type(source, sub);
1417            }
1418            // Fall back: first identifier
1419            let mut cursor = node.walk();
1420            for child in node.children(&mut cursor) {
1421                if child.kind() == "identifier" {
1422                    return source[child.start_byte()..child.end_byte()].to_string();
1423                }
1424            }
1425            String::new()
1426        }
1427        // Attribute node `module.Class` → take last identifier
1428        "attribute" => {
1429            if let Some(attr) = node.child_by_field_name("attribute") {
1430                return source[attr.start_byte()..attr.end_byte()].to_string();
1431            }
1432            String::new()
1433        }
1434        _ => {
1435            // Try first identifier child
1436            let mut cursor = node.walk();
1437            for child in node.children(&mut cursor) {
1438                if child.kind() == "identifier" {
1439                    return source[child.start_byte()..child.end_byte()].to_string();
1440                }
1441            }
1442            String::new()
1443        }
1444    }
1445}
1446
1447/// Scan a Python function body for `x = ClassName(...)` assignment patterns.
1448///
1449/// Returns a map from local variable name to constructor type.
1450/// E.g., `x = Foo()` → `{"x": "Foo"}`.
1451/// Also handles `x = module.ClassName(...)` → `{"x": "ClassName"}`.
1452fn extract_python_assignment_types(
1453    source: &str,
1454    fn_node: tree_sitter::Node<'_>,
1455) -> HashMap<String, String> {
1456    let mut bindings: HashMap<String, String> = HashMap::new();
1457    let Some(body) = fn_node.child_by_field_name("body") else {
1458        return bindings;
1459    };
1460
1461    let mut stack = vec![body];
1462    while let Some(n) = stack.pop() {
1463        if n.kind() == "assignment" {
1464            // assignment: left = right
1465            // We want: left is a simple identifier, right is a call whose
1466            // function is an identifier starting with an uppercase letter
1467            // (Python convention for class names).
1468            let left = n.child_by_field_name("left");
1469            let right = n.child_by_field_name("right");
1470            if let (Some(lhs), Some(rhs)) = (left, right)
1471                && lhs.kind() == "identifier"
1472                && rhs.kind() == "call"
1473                && let Some(func) = rhs.child_by_field_name("function")
1474            {
1475                let var_name = source[lhs.start_byte()..lhs.end_byte()].to_string();
1476                let constructor_type = match func.kind() {
1477                    "identifier" => {
1478                        let t = source[func.start_byte()..func.end_byte()].to_string();
1479                        // Class names are conventionally uppercase-first
1480                        if t.chars().next().is_some_and(char::is_uppercase) {
1481                            Some(t)
1482                        } else {
1483                            None
1484                        }
1485                    }
1486                    "attribute" => {
1487                        // `module.ClassName(...)` — take the `attribute` part
1488                        func.child_by_field_name("attribute")
1489                            .map(|a| source[a.start_byte()..a.end_byte()].to_string())
1490                    }
1491                    _ => None,
1492                };
1493                if let Some(ty) = constructor_type {
1494                    bindings.insert(var_name, ty);
1495                }
1496            }
1497        }
1498        let mut cursor = n.walk();
1499        for child in n.children(&mut cursor) {
1500            stack.push(child);
1501        }
1502    }
1503    bindings
1504}
1505
1506/// Walk a Python function body and annotate attribute-call byte offsets.
1507///
1508/// A Python method call is:
1509/// `(call function: (attribute value: <receiver> attribute: (identifier) @callee))`
1510///
1511/// The receiver is the `value` child of `attribute`. This function checks:
1512/// - `self` → use the enclosing class name (`class_ctx`).
1513/// - An identifier matching a parameter type in `param_types`.
1514/// - An identifier matching a constructor assignment in `let_types`.
1515fn annotate_python_method_calls(
1516    source: &str,
1517    fn_node: tree_sitter::Node<'_>,
1518    class_ctx: Option<&str>,
1519    param_types: &HashMap<String, String>,
1520    let_types: &HashMap<String, String>,
1521    map: &mut HashMap<u32, String>,
1522) {
1523    let mut stack = vec![fn_node];
1524    while let Some(n) = stack.pop() {
1525        if n.kind() == "call"
1526            && let Some(func) = n.child_by_field_name("function")
1527            && func.kind() == "attribute"
1528            && let (Some(recv_node), Some(attr_node)) = (
1529                func.child_by_field_name("object"),
1530                func.child_by_field_name("attribute"),
1531            )
1532        {
1533            // attribute node: object (receiver) + attribute (method name)
1534            let recv_text = source[recv_node.start_byte()..recv_node.end_byte()].to_string();
1535            let receiver_type = if recv_text == "self" || recv_text == "cls" {
1536                class_ctx.map(str::to_owned)
1537            } else if recv_node.kind() == "identifier" {
1538                param_types
1539                    .get(&recv_text)
1540                    .or_else(|| let_types.get(&recv_text))
1541                    .cloned()
1542            } else {
1543                None
1544            };
1545
1546            if let Some(ty) = receiver_type {
1547                // The `@callee` capture byte offset is the `attribute` child.
1548                #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1549                let attr_byte = attr_node.start_byte() as u32;
1550                map.insert(attr_byte, ty);
1551            }
1552        }
1553        let mut cursor = n.walk();
1554        for child in n.children(&mut cursor) {
1555            stack.push(child);
1556        }
1557    }
1558}
1559
1560// ── Python class hierarchy (MRO) extraction ───────────────────────────
1561
1562/// Walk the Python parse tree and add class → parent-names entries to `out`.
1563///
1564/// tree-sitter-python shape:
1565/// ```text
1566/// (class_definition
1567///   name: (identifier) @child
1568///   superclasses: (argument_list
1569///     (identifier) @parent             ; bare parent: class Foo(Bar):
1570///     (attribute attribute: (identifier) @parent) ; qualified: class Foo(mod.Bar):
1571///     (keyword_argument …)             ; ignored: class Foo(Bar, metaclass=Meta):
1572///   )?
1573///   ...)
1574/// ```
1575///
1576/// Each map entry's key is a class defined in the file; the value is the
1577/// ordered list of declared parent class **names** (the trailing `attribute`
1578/// segment, so `mod.Bar` becomes `Bar`). Classes with no declared parents
1579/// still get an entry (empty `Vec`) so a downstream MRO walk can tell
1580/// "known class with no parents" from "unknown class".
1581fn extract_python_class_hierarchy_node(
1582    source: &str,
1583    root: tree_sitter::Node<'_>,
1584    out: &mut HashMap<String, Vec<String>>,
1585) {
1586    let mut stack = vec![root];
1587    while let Some(n) = stack.pop() {
1588        if n.kind() == "class_definition"
1589            && let Some(name_node) = n.child_by_field_name("name")
1590        {
1591            let class_name = source[name_node.start_byte()..name_node.end_byte()].to_string();
1592            let mut parents: Vec<String> = Vec::new();
1593            if let Some(superclasses) = n.child_by_field_name("superclasses") {
1594                // superclasses is an `argument_list`; iterate its children and
1595                // collect identifiers / attribute trailing-names. Skip
1596                // keyword_argument entries (metaclass=…, etc.) and punctuation.
1597                let mut sc = superclasses.walk();
1598                for child in superclasses.children(&mut sc) {
1599                    match child.kind() {
1600                        "identifier" => {
1601                            let t = source[child.start_byte()..child.end_byte()].to_string();
1602                            parents.push(t);
1603                        }
1604                        "attribute" => {
1605                            // module.Cls → take the trailing `attribute` segment.
1606                            if let Some(attr) = child.child_by_field_name("attribute") {
1607                                parents
1608                                    .push(source[attr.start_byte()..attr.end_byte()].to_string());
1609                            }
1610                        }
1611                        // Drop keyword_argument, "(", ")", ",", comments, etc.
1612                        _ => {}
1613                    }
1614                }
1615            }
1616            out.insert(class_name, parents);
1617        }
1618        let mut cursor = n.walk();
1619        for child in n.children(&mut cursor) {
1620            stack.push(child);
1621        }
1622    }
1623}
1624
1625/// Extract the Python `class → [parents]` map from a single source file by
1626/// parsing it with tree-sitter-python.
1627///
1628/// Returns an empty map when the source fails to parse or contains no
1629/// `class_definition` nodes. The returned map is the per-file contribution
1630/// to the global hierarchy used by [`resolve_calls_with_python_mro_pub`]
1631/// for MRO-aware receiver-type dispatch (Q1, Wave 2).
1632#[must_use]
1633pub fn extract_python_class_hierarchy(source: &str) -> HashMap<String, Vec<String>> {
1634    let mut parser = Parser::new();
1635    let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
1636    if parser.set_language(&lang).is_err() {
1637        return HashMap::new();
1638    }
1639    let Some(tree) = parser.parse(source, None) else {
1640        return HashMap::new();
1641    };
1642    let mut out: HashMap<String, Vec<String>> = HashMap::new();
1643    extract_python_class_hierarchy_node(source, tree.root_node(), &mut out);
1644    out
1645}
1646
1647/// Compute the linearised MRO (Method Resolution Order) for a Python class
1648/// name using a **simplified left-first depth-first walk** of the declared
1649/// `class → [parents]` hierarchy.
1650///
1651/// Python's real MRO uses C3 linearisation, which is monotonic and respects
1652/// declaration order across the diamond inheritance shape. For ripvec's
1653/// reverse-call-graph purpose we want *any plausible ancestor* of the
1654/// receiver type — including ancestors only reachable via a mixin — so we
1655/// can resolve `self.method()` calls whose dispatch lands on an ancestor.
1656/// The simplification: pre-order DFS, left-to-right, skipping cycles via a
1657/// `visited` set.
1658///
1659/// On a non-diamond shape this matches C3 exactly. On a diamond the
1660/// simplified walk may surface an ancestor earlier than C3 would, but every
1661/// ancestor C3 would visit is still reached — and ripvec's goal is "find
1662/// the implementing def for an inherited call", not "compute the runtime
1663/// dispatch winner". Over-approximating ancestors only matters when two
1664/// ancestors define the same method, and even then the left-first order
1665/// matches C3 on the common patterns
1666/// (`class Sub(Base, Mixin)` → `Sub, Base, Mixin, <Base's ancestors>, <Mixin's ancestors>`).
1667///
1668/// The returned list excludes the start class itself. Each entry appears at
1669/// most once even when reachable through multiple parent chains.
1670fn compute_python_mro<H: std::hash::BuildHasher>(
1671    start: &str,
1672    hierarchy: &HashMap<String, Vec<String>, H>,
1673) -> Vec<String> {
1674    use std::collections::HashSet;
1675    let mut order: Vec<String> = Vec::new();
1676    let mut visited: HashSet<String> = HashSet::new();
1677    // Start with the immediate parents of `start` (the receiver-type's own
1678    // scope was already searched by Priority 2's direct match).
1679    let Some(start_parents) = hierarchy.get(start) else {
1680        return order;
1681    };
1682    // DFS stack: we push in reverse so pop yields left-first order.
1683    let mut stack: Vec<String> = start_parents.iter().rev().cloned().collect();
1684    while let Some(cls) = stack.pop() {
1685        if !visited.insert(cls.clone()) {
1686            continue;
1687        }
1688        order.push(cls.clone());
1689        if let Some(parents) = hierarchy.get(&cls) {
1690            for p in parents.iter().rev() {
1691                if !visited.contains(p) {
1692                    stack.push(p.clone());
1693                }
1694            }
1695        }
1696    }
1697    order
1698}
1699
1700// ── Go receiver-type heuristic ────────────────────────────────────────
1701
1702/// Walk the Go parse tree and fill `map` with receiver-type inference.
1703///
1704/// One heuristic case: **`recv.Method()` inside a `method_declaration`**.
1705///
1706/// Go methods have an explicit receiver parameter in their signature:
1707/// `func (r *Foo) Bar() { r.Baz() }` — `r` is bound to type `Foo`.
1708///
1709/// The Go call query captures:
1710/// `(call_expression function: (selector_expression field: (field_identifier) @callee))`
1711///
1712/// Within `selector_expression`, `operand` is the receiver expression and
1713/// `field` is the method name (the `@callee` capture).
1714///
1715/// This function also handles `self.method()` patterns for cases where code
1716/// uses `self` as a receiver name (not idiomatic Go, but it occurs).
1717fn collect_go_receiver_types(
1718    source: &str,
1719    root: tree_sitter::Node<'_>,
1720    map: &mut HashMap<u32, String>,
1721) {
1722    // Stack carries (node, receiver_binding: Option<(recv_name, recv_type)>).
1723    let mut stack: Vec<(tree_sitter::Node<'_>, Option<(String, String)>)> = vec![(root, None)];
1724
1725    while let Some((n, recv_binding)) = stack.pop() {
1726        if n.kind() == "method_declaration" {
1727            // Extract the receiver name and type from the method signature.
1728            let binding = extract_go_receiver_binding(source, n);
1729            let new_binding = binding.or_else(|| recv_binding.clone());
1730            let mut cursor = n.walk();
1731            for child in n.children(&mut cursor) {
1732                stack.push((child, new_binding.clone()));
1733            }
1734        } else {
1735            // For any call_expression whose function is a selector_expression,
1736            // check if the operand matches the active receiver binding.
1737            if n.kind() == "call_expression"
1738                && let Some(func) = n.child_by_field_name("function")
1739                && func.kind() == "selector_expression"
1740                && let (Some(operand), Some(field)) = (
1741                    func.child_by_field_name("operand"),
1742                    func.child_by_field_name("field"),
1743                )
1744            {
1745                let recv_text = source[operand.start_byte()..operand.end_byte()].to_string();
1746                let receiver_type = recv_binding.as_ref().and_then(|(recv_name, recv_ty)| {
1747                    if recv_text == *recv_name {
1748                        Some(recv_ty.clone())
1749                    } else {
1750                        None
1751                    }
1752                });
1753
1754                if let Some(ty) = receiver_type {
1755                    // The `@callee` capture byte offset is the `field` child.
1756                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1757                    let field_byte = field.start_byte() as u32;
1758                    map.insert(field_byte, ty);
1759                }
1760            }
1761
1762            let mut cursor = n.walk();
1763            for child in n.children(&mut cursor) {
1764                stack.push((child, recv_binding.clone()));
1765            }
1766        }
1767    }
1768}
1769
1770/// Extract the receiver name and base type from a Go `method_declaration`.
1771///
1772/// Go method declaration shape (tree-sitter-go):
1773/// ```text
1774/// (method_declaration
1775///   receiver: (parameter_list
1776///     (parameter_declaration
1777///       name: (identifier)       ← receiver name
1778///       type: (type_identifier   ← receiver type (bare)
1779///            | pointer_type (type_identifier)) ← or *Type
1780///     )
1781///   )
1782///   name: (field_identifier)
1783///   ...
1784/// )
1785/// ```
1786///
1787/// Returns `Some((recv_name, type_name))` or `None` if the receiver is unnamed
1788/// (blank identifier `_`) or has an unrecognisable shape.
1789fn extract_go_receiver_binding(
1790    source: &str,
1791    method_node: tree_sitter::Node<'_>,
1792) -> Option<(String, String)> {
1793    let receiver_list = method_node.child_by_field_name("receiver")?;
1794    // parameter_list contains one parameter_declaration
1795    let mut cursor = receiver_list.walk();
1796    for param in receiver_list.children(&mut cursor) {
1797        if param.kind() == "parameter_declaration" {
1798            let name_node = param.child_by_field_name("name");
1799            let type_node = param.child_by_field_name("type");
1800            if let (Some(nn), Some(tn)) = (name_node, type_node) {
1801                let name = source[nn.start_byte()..nn.end_byte()].to_string();
1802                if name == "_" || name.is_empty() {
1803                    return None;
1804                }
1805                let ty = extract_go_base_type(source, tn);
1806                if !ty.is_empty() {
1807                    return Some((name, ty));
1808                }
1809            }
1810        }
1811    }
1812    None
1813}
1814
1815/// Extract the base type name from a Go type node.
1816///
1817/// For `Foo` (type_identifier) → `"Foo"`.
1818/// For `*Foo` (pointer_type → type_identifier) → `"Foo"`.
1819fn extract_go_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1820    match node.kind() {
1821        "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1822        "pointer_type" => {
1823            // pointer_type has one child: the pointee type
1824            let mut cursor = node.walk();
1825            for child in node.children(&mut cursor) {
1826                if child.kind() == "type_identifier" {
1827                    return source[child.start_byte()..child.end_byte()].to_string();
1828                }
1829                let t = extract_go_base_type(source, child);
1830                if !t.is_empty() {
1831                    return t;
1832                }
1833            }
1834            String::new()
1835        }
1836        _ => {
1837            let mut cursor = node.walk();
1838            for child in node.children(&mut cursor) {
1839                if child.kind() == "type_identifier" {
1840                    return source[child.start_byte()..child.end_byte()].to_string();
1841                }
1842            }
1843            String::new()
1844        }
1845    }
1846}
1847
1848/// Enrich Go `method_declaration` definition scopes with their receiver type name.
1849///
1850/// In the generic `extract_definitions` path, `build_scope_chain` walks the
1851/// *parent* chain of the `@def` node. For Go `method_declaration`, the parent
1852/// is the file root — so the scope is always `""`.
1853///
1854/// An empty scope means `resolve_calls` Priority 2 (receiver-type matching via
1855/// `scope.contains(recv_type)`) never fires for Go methods. Cross-file calls
1856/// where the caller inferred `receiver_type = Some("Foo")` stay unresolved;
1857/// no edge is recorded; `def_callers[]` stays empty for those defs — the root
1858/// cause of the missing inverse index for Go (I#P1).
1859///
1860/// Fix: after `extract_definitions`, parse the Go source a second time to find
1861/// each `method_declaration`'s receiver type, then set the matching def's scope
1862/// to `"method_declaration {ReceiverType}"`.  This matches the pattern used by
1863/// the existing `go_resolve_receiver_method_via_signature` integration test,
1864/// which asserts that `scope.contains("Foo")` succeeds when the scope is
1865/// `"method_declaration Foo"`.
1866///
1867/// Matching is by `start_byte` (precise) so name collisions across different
1868/// receiver types are handled correctly.
1869fn enrich_go_method_def_scopes(source: &str, defs: &mut [Definition]) {
1870    let go_lang: tree_sitter::Language = tree_sitter_go::LANGUAGE.into();
1871    let mut parser = Parser::new();
1872    if parser.set_language(&go_lang).is_err() {
1873        return;
1874    }
1875    let Some(tree) = parser.parse(source, None) else {
1876        return;
1877    };
1878
1879    // Walk all top-level method_declaration nodes.
1880    let root = tree.root_node();
1881    let mut method_cursor = root.walk();
1882    for child in root.children(&mut method_cursor) {
1883        if child.kind() != "method_declaration" {
1884            continue;
1885        }
1886        let Some((_, recv_type)) = extract_go_receiver_binding(source, child) else {
1887            continue;
1888        };
1889        // Match by start_byte (precise): the @def node for method_declaration in
1890        // the Go definition query is the method_declaration node itself, so its
1891        // start_byte matches the def's start_byte recorded during extract_definitions.
1892        #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1893        let method_start_byte = child.start_byte() as u32;
1894        for def in defs.iter_mut() {
1895            if def.kind == "method_declaration" && def.start_byte == method_start_byte {
1896                def.scope = format!("method_declaration {recv_type}");
1897                break;
1898            }
1899        }
1900    }
1901}
1902
1903/// Public wrapper for `enrich_go_method_def_scopes` — enables integration tests
1904/// to call it directly without going through the full `build_graph` pipeline.
1905pub fn enrich_go_method_def_scopes_pub(source: &str, defs: &mut [Definition]) {
1906    enrich_go_method_def_scopes(source, defs);
1907}
1908
1909/// SQL: prepend a synthetic whole-file definition whose name is the filename stem.
1910///
1911/// dbt and sqlmesh follow a filename-as-model-name convention:
1912///
1913/// - `silver_issuer_returns.sql` defines the `silver_issuer_returns` model.
1914/// - `gold_issuer_returns.sql` references the silver model by filename stem,
1915///   not by any in-source CREATE TABLE.
1916///
1917/// In sqlmesh, the in-source name is templated:
1918///
1919/// ```sql
1920/// MODEL (
1921///   name @{athena_sqlmesh_silver_schema}.issuer_returns,
1922///   ...
1923/// );
1924/// SELECT ... FROM @{athena_sqlmesh_silver_schema}.stg_issuer_returns;
1925/// ```
1926///
1927/// The `MODEL (...)` header parses as an ERROR node under tree-sitter-sequel
1928/// because `@{var}` is not standard SQL; FROM/JOIN further down the file still
1929/// extract cleanly. Without a synthetic def, `lsp_workspace_symbols(query=
1930/// "silver_issuer_returns")` returns no hits — there is no real CREATE TABLE
1931/// in the file and the model name is interpolation only.
1932///
1933/// This helper prepends a definition with:
1934/// - `name` = filename stem (e.g., `silver_issuer_returns`)
1935/// - `kind` = `"sql_file"` (maps to `LSP SymbolKind::File` in
1936///   [`languages::lsp_symbol_kind_for_node_kind`])
1937/// - byte range = the entire source `[0, source.len())`
1938/// - scope / signature / qualified_name = empty / None
1939///
1940/// The whole-file byte range is the key to FROM/JOIN attribution: when
1941/// `extract_calls` later places a CallRef from a FROM clause that is not
1942/// inside any CTE or other smaller def, the smallest-enclosing-def search
1943/// lands on this synthetic file def and the edge is recorded.
1944///
1945/// If the filename has no stem (empty / `..`), the helper is a no-op.
1946/// Idempotent: if a `sql_file` def already exists at byte 0, it is left alone.
1947pub(crate) fn enrich_sql_file_def(filename: &str, source: &str, defs: &mut Vec<Definition>) {
1948    // Idempotency: do nothing if a sql_file def is already present at byte 0.
1949    if defs
1950        .iter()
1951        .any(|d| d.kind == "sql_file" && d.start_byte == 0)
1952    {
1953        return;
1954    }
1955
1956    // Derive the filename stem (last path component, file extension stripped).
1957    let stem = std::path::Path::new(filename)
1958        .file_stem()
1959        .and_then(|s| s.to_str())
1960        .unwrap_or_default();
1961    if stem.is_empty() {
1962        return;
1963    }
1964
1965    // Count newlines so end_line is reasonable for downstream UI.
1966    let end_line_zero_based = source.bytes().filter(|&b| b == b'\n').count();
1967    #[expect(clippy::cast_possible_truncation, reason = "line counts fit in u32")]
1968    let end_line = (end_line_zero_based as u32) + 1;
1969    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1970    let end_byte = source.len() as u32;
1971
1972    let file_def = Definition {
1973        name: stem.to_string(),
1974        kind: "sql_file".to_string(),
1975        start_line: 1,
1976        end_line,
1977        scope: String::new(),
1978        signature: None,
1979        start_byte: 0,
1980        end_byte,
1981        calls: vec![],
1982    };
1983    // Prepend so it remains the outermost (largest) enclosing def at byte 0,
1984    // ensuring narrow CTE defs are still preferred for inner-FROM attribution
1985    // (the smallest-enclosing rule in `extract_calls`).
1986    defs.insert(0, file_def);
1987}
1988
1989/// Public wrapper for [`enrich_sql_file_def`] — enables integration tests
1990/// to call it directly without going through the full `build_graph` pipeline.
1991///
1992/// This is `pub` (not `pub(crate)`) because integration tests in
1993/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
1994/// `pub(crate)` items.
1995pub fn enrich_sql_file_def_pub(filename: &str, source: &str, defs: &mut Vec<Definition>) {
1996    enrich_sql_file_def(filename, source, defs);
1997}
1998
1999/// Public wrapper for `extract_calls` — enables integration tests to call it
2000/// directly without going through the full `build_graph` pipeline.
2001///
2002/// This is `pub` (not `pub(crate)`) because integration tests in
2003/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
2004/// `pub(crate)` items.
2005pub fn extract_calls_pub(
2006    source: &str,
2007    call_config: &languages::CallConfig,
2008    defs: &mut [Definition],
2009) {
2010    extract_calls(source, call_config, defs);
2011}
2012
2013/// Build an index from definition name to list of `DefId`s.
2014#[must_use]
2015pub fn build_def_index_pub(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2016    build_def_index(files)
2017}
2018
2019fn build_def_index(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2020    let mut index: HashMap<String, Vec<DefId>> = HashMap::new();
2021    for (file_idx, file) in files.iter().enumerate() {
2022        for (def_idx, def) in file.defs.iter().enumerate() {
2023            #[expect(clippy::cast_possible_truncation)]
2024            let did: DefId = (file_idx as u32, def_idx as u16);
2025            index.entry(def.name.clone()).or_default().push(did);
2026        }
2027    }
2028    index
2029}
2030
2031/// Resolve call references to target definitions.
2032///
2033/// Resolution priority:
2034///
2035/// 1. **Qualified path** (`qualified_path = Some("mod_a::foo")`): filter candidates
2036///    by qualifier match (file path or scope contains the qualifier segment). Unique
2037///    match → resolve; ambiguous or no match → leave `None`.
2038/// 2. **Receiver type — direct scope match** (`receiver_type = Some("Foo")`):
2039///    for method calls, prefer candidates whose `scope` contains the receiver
2040///    type name (e.g., `"impl_item Foo"`). Among receiver-matching candidates,
2041///    further prefer those in imported files. Unique match → resolve;
2042///    ambiguous → leave `None`.
2043///    When this step finds nothing on the receiver class itself, sub-step 2b
2044///    (Python MRO walk) runs: when `Foo` has a recorded parent chain, walk
2045///    the receiver class's MRO (left-first DFS) and try the scope-match
2046///    against each ancestor's name. First ancestor with a matching candidate
2047///    wins. See [`compute_python_mro`] for the simplification rationale (it
2048///    diverges from C3 only on diamond shapes where two ancestors define the
2049///    same name).
2050/// 3. **Same file** (unqualified, no receiver): prefer definitions in the caller's
2051///    own file.
2052/// 4. **SQL suffix-match** (sql_file callers only, no exact-name match): when
2053///    the caller def has `kind = "sql_file"` and the bare `call_name` (e.g.,
2054///    `"issuer_returns"`) has no exact entry in the def index, scan all
2055///    `sql_file` defs for names ending with `_<call_name>` (e.g.,
2056///    `"silver_issuer_returns"`). This bridges dbt / sqlmesh layered schema
2057///    prefixes: `gold_issuer_returns.sql` uses `FROM @{schema}.issuer_returns`
2058///    which tree-sitter reduces to the bare name `"issuer_returns"`, while the
2059///    target def is the synthetic `sql_file` def named `"silver_issuer_returns"`.
2060///    Unique suffix-match → resolve; ambiguous (multiple layers match) or
2061///    no match → leave `None`. Non-sql_file callers are explicitly excluded.
2062/// 5. **Imported file** (unqualified, no receiver): check definitions in files this
2063///    file imports. Unique imported candidate → resolve.
2064/// 6. **Ambiguous or unresolved**: leave `resolved` as `None` (no silent first-wins).
2065///
2066/// Equivalent to [`resolve_calls_with_python_mro_pub`] with an empty MRO map
2067/// (Priority 2.5 is a no-op).
2068pub fn resolve_calls_pub<S: std::hash::BuildHasher>(
2069    files: &mut [FileNode],
2070    def_index: &HashMap<String, Vec<DefId>, S>,
2071) {
2072    let empty: HashMap<String, Vec<String>> = HashMap::new();
2073    resolve_calls_inner(files, def_index, &empty);
2074}
2075
2076/// Resolve call references with MRO-aware Python receiver dispatch enabled.
2077///
2078/// Identical to [`resolve_calls_pub`] except that Priority 2.5 (the MRO walk)
2079/// fires when the caller passes a non-empty `python_class_hierarchy`.
2080/// `build_graph` populates the hierarchy by parsing every Python source file
2081/// with [`extract_python_class_hierarchy`] and merging the per-file maps.
2082///
2083/// Tests that want to exercise MRO resolution without going through
2084/// `build_graph` can call this directly with a synthetic hierarchy.
2085pub fn resolve_calls_with_python_mro_pub<S, H>(
2086    files: &mut [FileNode],
2087    def_index: &HashMap<String, Vec<DefId>, S>,
2088    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2089) where
2090    S: std::hash::BuildHasher,
2091    H: std::hash::BuildHasher,
2092{
2093    resolve_calls_inner(files, def_index, python_class_hierarchy);
2094}
2095
2096fn resolve_calls<S, H>(
2097    files: &mut [FileNode],
2098    def_index: &HashMap<String, Vec<DefId>, S>,
2099    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2100) where
2101    S: std::hash::BuildHasher,
2102    H: std::hash::BuildHasher,
2103{
2104    resolve_calls_inner(files, def_index, python_class_hierarchy);
2105}
2106
2107#[expect(
2108    clippy::too_many_lines,
2109    reason = "7-priority resolution cascade (qualified path, receiver type, MRO walk, same-file, \
2110              SQL suffix-match, imported-file, ambiguous); each priority is a distinct decision \
2111              branch and extracting helpers would require passing large shared state across boundaries"
2112)]
2113fn resolve_calls_inner<S, H>(
2114    files: &mut [FileNode],
2115    def_index: &HashMap<String, Vec<DefId>, S>,
2116    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2117) where
2118    S: std::hash::BuildHasher,
2119    H: std::hash::BuildHasher,
2120{
2121    // Pre-compute imported file sets for each file.
2122    let imported_files: Vec<std::collections::HashSet<u32>> = files
2123        .iter()
2124        .map(|f| {
2125            f.imports
2126                .iter()
2127                .filter_map(|imp| imp.resolved_idx)
2128                .collect()
2129        })
2130        .collect();
2131
2132    for file_idx in 0..files.len() {
2133        for def_idx in 0..files[file_idx].defs.len() {
2134            for call_idx in 0..files[file_idx].defs[def_idx].calls.len() {
2135                let call_name = files[file_idx].defs[def_idx].calls[call_idx].name.clone();
2136                let qualified_path = files[file_idx].defs[def_idx].calls[call_idx]
2137                    .qualified_path
2138                    .clone();
2139                let receiver_type = files[file_idx].defs[def_idx].calls[call_idx]
2140                    .receiver_type
2141                    .clone();
2142
2143                // HCL: dedicated resolution for `terraform_remote_state.<NAME>`
2144                // and `module.<NAME>` qualified paths. Aurora's module DAG is
2145                // expressed by these patterns; resolve to the first def in any
2146                // file under a `/<NAME>/` directory segment. This is the
2147                // module-source contract (R2 + R3, Wave 3).
2148                if let Some(ref qpath) = qualified_path
2149                    && (qpath.starts_with("terraform_remote_state.")
2150                        || qpath.starts_with("module."))
2151                {
2152                    let target = &call_name; // already the bare module label
2153                    let segment_match = format!("/{target}/");
2154                    let alt_segment_prefix = format!("{target}/"); // when path starts with target dir
2155                    let candidate = files.iter().enumerate().find_map(|(idx, f)| {
2156                        if f.path.contains(&segment_match)
2157                            || f.path.starts_with(&alt_segment_prefix)
2158                        {
2159                            // Pick the first def in the file (or skip if file
2160                            // has no defs).
2161                            if !f.defs.is_empty() {
2162                                #[expect(
2163                                    clippy::cast_possible_truncation,
2164                                    reason = "file index fits in u32"
2165                                )]
2166                                {
2167                                    return Some((idx as u32, 0u16));
2168                                }
2169                            }
2170                        }
2171                        None
2172                    });
2173                    if let Some(did) = candidate {
2174                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2175                    }
2176                    continue;
2177                }
2178
2179                // ── Priority 1: Qualified-path resolution ────────────────
2180                //
2181                // `qualified_path` carries the full scoped path (e.g. "mod_a::foo").
2182                // We look up candidates by the bare `call_name`, then filter by
2183                // whether the file path or scope contains the qualifier prefix.
2184                if let Some(ref qpath) = qualified_path {
2185                    // Qualifier is everything before the final `::`.
2186                    let qualifier = if let Some(pos) = qpath.rfind("::") {
2187                        &qpath[..pos]
2188                    } else {
2189                        qpath.as_str()
2190                    };
2191                    let qual_segments: Vec<&str> = qualifier.split("::").collect();
2192
2193                    let Some(candidates) = def_index.get(&call_name) else {
2194                        continue;
2195                    };
2196
2197                    let matching: Vec<DefId> = candidates
2198                        .iter()
2199                        .copied()
2200                        .filter(|&(f_idx, _)| {
2201                            let file_path = &files[f_idx as usize].path;
2202                            let last_segment = qual_segments.last().copied().unwrap_or("");
2203                            let path_as_module =
2204                                file_path.trim_end_matches(".rs").replace(['/', '\\'], "::");
2205                            path_as_module.contains(last_segment)
2206                                || file_path.contains(last_segment)
2207                        })
2208                        .collect();
2209
2210                    if matching.len() == 1 {
2211                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(matching[0]);
2212                    }
2213                    // Ambiguous or no match → leave None.
2214                    continue;
2215                }
2216
2217                // ── Priority 3.5: SQL suffix-match resolution ────────────
2218                //
2219                // dbt / sqlmesh pipelines use layered schema prefixes:
2220                // `silver_issuer_returns.sql` defines the silver-layer model.
2221                // `gold_issuer_returns.sql` references it via a FROM clause
2222                // that tree-sitter parses as `name: "issuer_returns"` (the
2223                // `name:` field-selector strips the `@{schema}.` prefix).
2224                // `def_index.get("issuer_returns")` returns None — the def is
2225                // stored under "silver_issuer_returns".
2226                //
2227                // When: (a) no exact-name candidate exists, AND (b) the
2228                // caller's enclosing def is a sql_file (whole-file synthetic
2229                // def emitted by `enrich_sql_file_def`), walk every sql_file
2230                // def in the graph and check whether its name ends with
2231                // `_<call_name>`. Unique suffix-match → resolve. Ambiguous
2232                // (e.g., both gold_ and silver_ match the same bare name) →
2233                // leave None (no silent first-wins).
2234                //
2235                // Non-sql_file callers are explicitly excluded: a Rust
2236                // function_item whose call_name happens to end with a suffix
2237                // of some sql_file def must NOT be resolved via this path.
2238                if !def_index.contains_key(&call_name)
2239                    && files[file_idx].defs[def_idx].kind == "sql_file"
2240                    && !call_name.is_empty()
2241                {
2242                    let suffix = format!("_{call_name}");
2243                    let suffix_str = suffix.as_str();
2244                    // Exclude the caller def itself from the suffix scan:
2245                    // `gold_issuer_returns` also ends with `_issuer_returns`
2246                    // but must not self-resolve.
2247                    #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
2248                    let caller_did: DefId = (file_idx as u32, def_idx as u16);
2249                    let suffix_matches: Vec<DefId> = files
2250                        .iter()
2251                        .enumerate()
2252                        .flat_map(|(f_idx, f)| {
2253                            f.defs.iter().enumerate().filter_map(move |(d_idx, d)| {
2254                                #[expect(
2255                                    clippy::cast_possible_truncation,
2256                                    reason = "file and def indices fit in u32/u16"
2257                                )]
2258                                let did: DefId = (f_idx as u32, d_idx as u16);
2259                                if d.kind == "sql_file"
2260                                    && d.name.ends_with(suffix_str)
2261                                    && did != caller_did
2262                                {
2263                                    Some(did)
2264                                } else {
2265                                    None
2266                                }
2267                            })
2268                        })
2269                        .collect();
2270                    if suffix_matches.len() == 1 {
2271                        files[file_idx].defs[def_idx].calls[call_idx].resolved =
2272                            Some(suffix_matches[0]);
2273                    }
2274                    // Ambiguous (>1) or no match (0) → leave None.
2275                    continue;
2276                }
2277
2278                let Some(candidates) = def_index.get(&call_name) else {
2279                    continue;
2280                };
2281
2282                // ── Priority 2: Receiver-type resolution ─────────────────
2283                //
2284                // `receiver_type = Some("Foo")` means this is a method call on a
2285                // value whose type is `Foo`. Filter candidates to those whose scope
2286                // chain contains the receiver type name.
2287                if let Some(ref rtype) = receiver_type {
2288                    // Candidates whose scope contains the receiver type name.
2289                    let receiver_matching: Vec<DefId> = candidates
2290                        .iter()
2291                        .copied()
2292                        .filter(|&(f_idx, d_idx)| {
2293                            let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2294                            scope.contains(rtype.as_str())
2295                        })
2296                        .collect();
2297
2298                    if receiver_matching.len() == 1 {
2299                        files[file_idx].defs[def_idx].calls[call_idx].resolved =
2300                            Some(receiver_matching[0]);
2301                        continue;
2302                    }
2303
2304                    if receiver_matching.len() > 1 {
2305                        // Among receiver-matching candidates, prefer those in imported files.
2306                        let imported_receiver_matching: Vec<DefId> = receiver_matching
2307                            .iter()
2308                            .copied()
2309                            .filter(|(f, _)| imported_files[file_idx].contains(f))
2310                            .collect();
2311                        if imported_receiver_matching.len() == 1 {
2312                            files[file_idx].defs[def_idx].calls[call_idx].resolved =
2313                                Some(imported_receiver_matching[0]);
2314                        }
2315                        // Ambiguous even after import filter → leave None.
2316                        continue;
2317                    }
2318
2319                    // ── Priority 2.5: Python MRO walk ─────────────────────
2320                    //
2321                    // The receiver type's own scope has no matching def — but
2322                    // the method may live on an ancestor class. Walk the MRO
2323                    // (left-first DFS) and try the scope-match against each
2324                    // ancestor's name. First ancestor with at least one
2325                    // scope-matching candidate wins; if multiple candidates
2326                    // match for the same ancestor, prefer imported files,
2327                    // else take the first in stable order.
2328                    //
2329                    // Liskov: a subclass's `self.method()` call must dispatch
2330                    // through the MRO; over-approximating ancestors is the
2331                    // correct conservative move for a reverse call graph.
2332                    // For non-Python languages or Python receivers with no
2333                    // recorded parents, `compute_python_mro` returns an
2334                    // empty vector and this loop is a no-op.
2335                    let mro = compute_python_mro(rtype, python_class_hierarchy);
2336                    let mut resolved_via_mro: Option<DefId> = None;
2337                    for ancestor in &mro {
2338                        let ancestor_matching: Vec<DefId> = candidates
2339                            .iter()
2340                            .copied()
2341                            .filter(|&(f_idx, d_idx)| {
2342                                let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2343                                scope.contains(ancestor.as_str())
2344                            })
2345                            .collect();
2346                        if ancestor_matching.len() == 1 {
2347                            resolved_via_mro = Some(ancestor_matching[0]);
2348                            break;
2349                        }
2350                        if ancestor_matching.len() > 1 {
2351                            // Prefer imported files among the ancestor matches.
2352                            let imported_ancestor: Vec<DefId> = ancestor_matching
2353                                .iter()
2354                                .copied()
2355                                .filter(|(f, _)| imported_files[file_idx].contains(f))
2356                                .collect();
2357                            if imported_ancestor.len() == 1 {
2358                                resolved_via_mro = Some(imported_ancestor[0]);
2359                                break;
2360                            }
2361                            // Ambiguous at this ancestor — pick the first
2362                            // candidate in stable order. The MRO walk's job
2363                            // is to find *an* implementing def for an
2364                            // inherited call, not to compute the runtime
2365                            // dispatch winner; any plausible candidate is
2366                            // useful for the reverse call graph.
2367                            resolved_via_mro = Some(ancestor_matching[0]);
2368                            break;
2369                        }
2370                    }
2371                    if let Some(did) = resolved_via_mro {
2372                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2373                        continue;
2374                    }
2375                    // No receiver-matching candidates anywhere in the MRO →
2376                    // fall through to bare-name resolution.
2377                }
2378
2379                // ── Priority 3: Same-file resolution ─────────────────────
2380                #[expect(clippy::cast_possible_truncation)]
2381                let file_idx_u32 = file_idx as u32;
2382                if let Some(&did) = candidates.iter().find(|(f, _)| *f == file_idx_u32) {
2383                    files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2384                    continue;
2385                }
2386
2387                // ── Priority 4: Imported-file resolution ──────────────────
2388                let imported_candidates: Vec<DefId> = candidates
2389                    .iter()
2390                    .copied()
2391                    .filter(|(f, _)| imported_files[file_idx].contains(f))
2392                    .collect();
2393                if imported_candidates.len() == 1 {
2394                    files[file_idx].defs[def_idx].calls[call_idx].resolved =
2395                        Some(imported_candidates[0]);
2396                }
2397                // Priority 5: Ambiguous or unresolved → leave None.
2398            }
2399        }
2400    }
2401}
2402
2403/// Compute a prefix-sum offset table for flattening `DefId`s to linear indices.
2404fn def_offsets(files: &[FileNode]) -> Vec<usize> {
2405    let mut offsets = Vec::with_capacity(files.len() + 1);
2406    offsets.push(0);
2407    for file in files {
2408        offsets.push(offsets.last().unwrap() + file.defs.len());
2409    }
2410    offsets
2411}
2412
2413/// Flatten a `DefId` to a linear index using the offset table.
2414fn flatten_def_id(offsets: &[usize], did: DefId) -> usize {
2415    offsets[did.0 as usize] + did.1 as usize
2416}
2417
2418/// Build top-N caller and callee lists for each definition (flattened).
2419fn build_def_neighbor_lists(
2420    n: usize,
2421    edges: &[(u32, u32, u32)],
2422    offsets: &[usize],
2423) -> (Vec<Vec<DefId>>, Vec<Vec<DefId>>) {
2424    let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
2425    let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
2426
2427    for &(src, dst, w) in edges {
2428        let (s, d) = (src as usize, dst as usize);
2429        if s < n && d < n {
2430            incoming[d].push((src, w));
2431            outgoing[s].push((dst, w));
2432        }
2433    }
2434
2435    // Convert flat index back to DefId
2436    let to_def_id = |flat: u32| -> DefId {
2437        let flat_usize = flat as usize;
2438        let file_idx = offsets.partition_point(|&o| o <= flat_usize) - 1;
2439        let def_idx = flat_usize - offsets[file_idx];
2440        #[expect(clippy::cast_possible_truncation)]
2441        (file_idx as u32, def_idx as u16)
2442    };
2443
2444    let callers = incoming
2445        .into_iter()
2446        .map(|mut v| {
2447            v.sort_by_key(|b| std::cmp::Reverse(b.1));
2448            v.truncate(MAX_NEIGHBORS);
2449            v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
2450        })
2451        .collect();
2452
2453    let callees = outgoing
2454        .into_iter()
2455        .map(|mut v| {
2456            v.sort_by_key(|b| std::cmp::Reverse(b.1));
2457            v.truncate(MAX_NEIGHBORS);
2458            v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
2459        })
2460        .collect();
2461
2462    (callers, callees)
2463}
2464
2465// ── PageRank ─────────────────────────────────────────────────────────
2466
2467/// Compute `PageRank` scores for a graph.
2468///
2469/// If `focus` is `Some(idx)`, computes topic-sensitive `PageRank` biased
2470/// toward file `idx`. Otherwise computes standard (uniform) `PageRank`.
2471///
2472/// Returns one score per node, summing to 1.0.
2473#[expect(
2474    clippy::cast_precision_loss,
2475    reason = "node count fits comfortably in f32"
2476)]
2477fn pagerank(n: usize, edges: &[(u32, u32, u32)], focus: Option<usize>) -> Vec<f32> {
2478    if n == 0 {
2479        return vec![];
2480    }
2481
2482    // Build adjacency: out_edges[src] = [(dst, weight)]
2483    let mut out_edges: Vec<Vec<(usize, f32)>> = vec![vec![]; n];
2484    let mut out_weight: Vec<f32> = vec![0.0; n];
2485
2486    for &(src, dst, w) in edges {
2487        let (s, d) = (src as usize, dst as usize);
2488        if s < n && d < n {
2489            #[expect(clippy::cast_possible_truncation, reason = "edge weights are small")]
2490            let wf = f64::from(w) as f32;
2491            out_edges[s].push((d, wf));
2492            out_weight[s] += wf;
2493        }
2494    }
2495
2496    // Personalization vector (Haveliwala 2002, topic-sensitive PageRank).
2497    //
2498    // When a focus file is specified, the teleportation distribution is split:
2499    //   - PERSONALIZATION_ALPHA (0.15) concentrated on the focus node.
2500    //   - (1 - PERSONALIZATION_ALPHA) = 0.85 spread uniformly over the
2501    //     remaining (n - 1) other nodes.
2502    //
2503    // This gives the focus file a gentle bias over its neighbors without
2504    // collapsing every other file to an equal uniform floor. The resulting
2505    // ranks still vary across the corpus, so the caller sees a *neighborhood*
2506    // of semantically related files rebiased toward the focus (I#16 fix).
2507    //
2508    // For n == 1 there are no other nodes; the focus gets all mass (= 1.0).
2509    let bias: Vec<f32> = if let Some(idx) = focus {
2510        if n == 1 {
2511            vec![1.0_f32]
2512        } else {
2513            let other_mass = (1.0_f32 - PERSONALIZATION_ALPHA) / (n as f32 - 1.0);
2514            let mut b = vec![other_mass; n];
2515            if idx < n {
2516                b[idx] = PERSONALIZATION_ALPHA;
2517            }
2518            // Verify sum ≈ 1.0 (should hold by construction; normalization
2519            // guards against floating-point drift on very large graphs).
2520            let sum: f32 = b.iter().sum();
2521            for v in &mut b {
2522                *v /= sum;
2523            }
2524            b
2525        }
2526    } else {
2527        vec![1.0 / n as f32; n]
2528    };
2529
2530    let mut rank = vec![1.0 / n as f32; n];
2531    let mut next_rank = vec![0.0_f32; n];
2532
2533    for _ in 0..MAX_ITERATIONS {
2534        // Collect dangling mass (nodes with no outgoing edges)
2535        let dangling: f32 = rank
2536            .iter()
2537            .enumerate()
2538            .filter(|&(i, _)| out_edges[i].is_empty())
2539            .map(|(_, &r)| r)
2540            .sum();
2541
2542        // Distribute rank
2543        for (i, nr) in next_rank.iter_mut().enumerate() {
2544            *nr = (1.0 - DAMPING).mul_add(bias[i], DAMPING * dangling * bias[i]);
2545        }
2546
2547        for (src, edges_list) in out_edges.iter().enumerate() {
2548            if edges_list.is_empty() {
2549                continue;
2550            }
2551            let src_rank = rank[src];
2552            let total_w = out_weight[src];
2553            for &(dst, w) in edges_list {
2554                next_rank[dst] += DAMPING * src_rank * (w / total_w);
2555            }
2556        }
2557
2558        // Check convergence
2559        let diff: f32 = rank
2560            .iter()
2561            .zip(next_rank.iter())
2562            .map(|(a, b)| (a - b).abs())
2563            .sum();
2564
2565        std::mem::swap(&mut rank, &mut next_rank);
2566
2567        if diff < EPSILON {
2568            break;
2569        }
2570    }
2571
2572    rank
2573}
2574
2575// ── Graph Building ───────────────────────────────────────────────────
2576
2577/// Intermediate result from definition-level graph computation.
2578struct DefGraphData {
2579    def_edges: Vec<(DefId, DefId, u32)>,
2580    def_ranks: Vec<f32>,
2581    def_callers: Vec<Vec<DefId>>,
2582    def_callees: Vec<Vec<DefId>>,
2583    offsets: Vec<usize>,
2584    base_ranks: Vec<f32>,
2585    file_edges: Vec<(u32, u32, u32)>,
2586}
2587
2588/// Build bidirectional trait↔impl method edges for PageRank propagation (G3).
2589///
2590/// For every impl method that overrides a trait method, adds:
2591/// - `(impl_def_id, trait_def_id, 1)` — impl → trait
2592/// - `(trait_def_id, impl_def_id, 1)` — trait → impl
2593///
2594/// Detection heuristic: an impl method "overrides" a trait method when:
2595/// - The impl method's kind is `"function_item"` and its `scope` starts with
2596///   `"impl_item"`.
2597/// - The trait method's kind is `"function_signature_item"` and its `scope`
2598///   starts with `"trait_item"`.
2599/// - Both have the same `name`.
2600/// - The impl's file imports the trait's file (or they share a file).
2601///
2602/// This is heuristic, not sound: it may produce false positives when two
2603/// unrelated traits define methods with the same name. The practical false-
2604/// positive rate on real Rust codebases is low because method names are
2605/// usually unique within a crate.
2606#[must_use]
2607pub fn build_trait_impl_edges_pub(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
2608    build_trait_impl_edges(files)
2609}
2610
2611fn build_trait_impl_edges(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
2612    // Build index: method_name → list of (DefId, is_trait_method).
2613    // trait method: kind == "function_signature_item" (abstract) OR scope contains "trait_item".
2614    // impl method:  kind == "function_item" AND scope contains "impl_item".
2615    let mut trait_methods: HashMap<String, Vec<DefId>> = HashMap::new();
2616    let mut impl_methods: HashMap<String, Vec<DefId>> = HashMap::new();
2617
2618    for (fi, file) in files.iter().enumerate() {
2619        for (di, def) in file.defs.iter().enumerate() {
2620            #[expect(clippy::cast_possible_truncation)]
2621            let did: DefId = (fi as u32, di as u16);
2622            if def.kind == "function_signature_item"
2623                || (def.scope.starts_with("trait_item") && def.kind == "function_item")
2624            {
2625                trait_methods.entry(def.name.clone()).or_default().push(did);
2626            } else if def.kind == "function_item" && def.scope.starts_with("impl_item") {
2627                impl_methods.entry(def.name.clone()).or_default().push(did);
2628            }
2629        }
2630    }
2631
2632    // Pre-build imported-files sets to restrict matching.
2633    let imported_sets: Vec<std::collections::HashSet<u32>> = files
2634        .iter()
2635        .map(|f| {
2636            f.imports
2637                .iter()
2638                .filter_map(|imp| imp.resolved_idx)
2639                .collect()
2640        })
2641        .collect();
2642
2643    let mut edges: Vec<(DefId, DefId, u32)> = Vec::new();
2644
2645    for (name, trait_defs) in &trait_methods {
2646        let Some(impl_defs) = impl_methods.get(name) else {
2647            continue;
2648        };
2649        for &(tf, td) in trait_defs {
2650            for &(imf, imd) in impl_defs {
2651                // The impl file must import the trait file (or be the same file).
2652                let connected = tf == imf
2653                    || imported_sets
2654                        .get(imf as usize)
2655                        .is_some_and(|s| s.contains(&tf));
2656                if connected {
2657                    let trait_id: DefId = (tf, td);
2658                    let impl_id: DefId = (imf, imd);
2659                    edges.push((trait_id, impl_id, 1));
2660                    edges.push((impl_id, trait_id, 1));
2661                }
2662            }
2663        }
2664    }
2665
2666    edges
2667}
2668
2669/// Build definition-level edges, compute `PageRank`, and derive file-level data.
2670fn compute_def_graph(files: &[FileNode]) -> DefGraphData {
2671    // Build definition-level edge list from resolved calls
2672    let mut def_edge_map: HashMap<(DefId, DefId), u32> = HashMap::new();
2673    for (file_idx, file) in files.iter().enumerate() {
2674        for (def_idx, def) in file.defs.iter().enumerate() {
2675            #[expect(clippy::cast_possible_truncation)]
2676            let caller_id: DefId = (file_idx as u32, def_idx as u16);
2677            for call in &def.calls {
2678                if let Some(callee_id) = call.resolved {
2679                    *def_edge_map.entry((caller_id, callee_id)).or_insert(0) += 1;
2680                }
2681            }
2682        }
2683    }
2684
2685    // Add trait↔impl bidirectional edges (G3).
2686    let trait_impl_edges = build_trait_impl_edges(files);
2687    for (src, dst, w) in trait_impl_edges {
2688        *def_edge_map.entry((src, dst)).or_insert(0) += w;
2689    }
2690
2691    let def_edges: Vec<(DefId, DefId, u32)> = def_edge_map
2692        .into_iter()
2693        .map(|((src, dst), w)| (src, dst, w))
2694        .collect();
2695
2696    // Compute def-level PageRank
2697    let offsets = def_offsets(files);
2698    let n_defs = *offsets.last().unwrap_or(&0);
2699
2700    let flat_def_edges: Vec<(u32, u32, u32)> = def_edges
2701        .iter()
2702        .map(|(src, dst, w)| {
2703            #[expect(clippy::cast_possible_truncation)]
2704            (
2705                flatten_def_id(&offsets, *src) as u32,
2706                flatten_def_id(&offsets, *dst) as u32,
2707                *w,
2708            )
2709        })
2710        .collect();
2711
2712    let def_ranks = pagerank(n_defs, &flat_def_edges, None);
2713
2714    // Aggregate def ranks to file level
2715    let base_ranks: Vec<f32> = files
2716        .iter()
2717        .enumerate()
2718        .map(|(i, file)| {
2719            let start = offsets[i];
2720            let end = start + file.defs.len();
2721            def_ranks[start..end].iter().sum()
2722        })
2723        .collect();
2724
2725    // Derive file-level edges from def-level call edges
2726    let mut file_edge_map: HashMap<(u32, u32), u32> = HashMap::new();
2727    for &(src, dst, w) in &def_edges {
2728        let src_file = src.0;
2729        let dst_file = dst.0;
2730        if src_file != dst_file {
2731            *file_edge_map.entry((src_file, dst_file)).or_insert(0) += w;
2732        }
2733    }
2734    let file_edges: Vec<(u32, u32, u32)> = file_edge_map
2735        .into_iter()
2736        .map(|((src, dst), w)| (src, dst, w))
2737        .collect();
2738
2739    // Build def-level caller/callee lists
2740    let (def_callers, def_callees) = build_def_neighbor_lists(n_defs, &flat_def_edges, &offsets);
2741
2742    DefGraphData {
2743        def_edges,
2744        def_ranks,
2745        def_callers,
2746        def_callees,
2747        offsets,
2748        base_ranks,
2749        file_edges,
2750    }
2751}
2752
2753/// Build a dependency graph from a repository root.
2754///
2755/// Walks the directory tree, parses each supported file with tree-sitter,
2756/// extracts definitions and imports, resolves import paths to files, runs
2757/// `PageRank`, and builds caller/callee lists.
2758///
2759/// # Errors
2760///
2761/// Returns an error if file walking or reading fails.
2762#[expect(
2763    clippy::too_many_lines,
2764    reason = "three-phase parallel pipeline (walk+filter, def+import extraction, call extraction) \
2765              plus resolve + graph build; phases share state (file_index, raw_sources) and \
2766              cannot be meaningfully split without passing large mutable structures across \
2767              boundaries with no clarity gain"
2768)]
2769pub fn build_graph(root: &Path) -> crate::Result<RepoGraph> {
2770    let root = root.canonicalize().map_err(|e| crate::Error::Io {
2771        path: root.display().to_string(),
2772        source: e,
2773    })?;
2774
2775    let mut walk_options = walk::WalkOptions::default();
2776    if let Some((_, config)) = crate::cache::config::find_config(&root) {
2777        walk_options.ignore_patterns = config.ignore.patterns;
2778    }
2779    let all_files = walk::collect_files_with_options(&root, &walk_options);
2780
2781    // Phase 1: parallel filter + read. For each candidate path with a
2782    // supported extension, read its source from disk and emit a tuple
2783    // alongside its relative path. rayon spreads the I/O cost across
2784    // worker threads; on a 1M-file corpus this was ~20s sequential and
2785    // now sits in the 2-3s range bounded by disk + filter throughput.
2786    let raw_inputs: Vec<(PathBuf, String, String, String)> = all_files
2787        .par_iter()
2788        .filter_map(|path| {
2789            let ext = path
2790                .extension()
2791                .and_then(|e| e.to_str())
2792                .unwrap_or_default()
2793                .to_string();
2794            if languages::config_for_extension(&ext).is_none()
2795                && import_query_for_extension(&ext).is_none()
2796            {
2797                return None;
2798            }
2799            let source = std::fs::read_to_string(path).ok()?;
2800            let rel_path = path
2801                .strip_prefix(&root)
2802                .unwrap_or(path)
2803                .display()
2804                .to_string();
2805            Some((path.clone(), rel_path, ext, source))
2806        })
2807        .collect();
2808
2809    // Build the contiguous `files` Vec and the absolute-path -> idx
2810    // lookup. Sequential because both want stable indices that match
2811    // `raw_sources`'s order; the per-file work this gates is trivial.
2812    let mut file_index: HashMap<PathBuf, usize> = HashMap::with_capacity(raw_inputs.len());
2813    let mut files: Vec<FileNode> = Vec::with_capacity(raw_inputs.len());
2814    let mut raw_sources: Vec<(usize, String, String)> = Vec::with_capacity(raw_inputs.len());
2815    for (idx, (abs_path, rel_path, ext, source)) in raw_inputs.into_iter().enumerate() {
2816        file_index.insert(abs_path, idx);
2817        files.push(FileNode {
2818            path: rel_path,
2819            defs: vec![],
2820            imports: vec![],
2821        });
2822        raw_sources.push((idx, ext, source));
2823    }
2824
2825    // Phase 2: parallel per-file definition + import extraction. Each
2826    // file's tree-sitter parse + def/import queries are independent;
2827    // par_iter_mut over files.iter_mut().zip(raw_sources.par_iter())
2828    // lets every rayon worker grind its own slice. The closures here
2829    // borrow `&root` and `&file_index` immutably (both Sync) and write
2830    // disjoint `FileNode` slots via the &mut iterator.
2831    files
2832        .par_iter_mut()
2833        .zip(raw_sources.par_iter())
2834        .for_each(|(file, (_, ext, source))| {
2835            if let Some(config) = languages::config_for_extension(ext) {
2836                file.defs = extract_definitions(source, &config);
2837                // Go method_declaration scopes are empty after the generic
2838                // extract_definitions pass (the method is a top-level node
2839                // with no structural parent in CONTAINER_KINDS). Enrich them
2840                // with the receiver type so that resolve_calls Priority 2
2841                // (scope.contains(recv_type)) fires correctly for cross-file
2842                // Go receiver-method calls. This populates def_callers[] for
2843                // Go in compute_def_graph (P1 fix).
2844                if languages::is_go_language(&config.language) {
2845                    enrich_go_method_def_scopes(source, &mut file.defs);
2846                }
2847                // SQL: prepend a synthetic file-level def named after the
2848                // filename stem (dbt/sqlmesh convention). The whole-file
2849                // byte range becomes the smallest-enclosing fallback for
2850                // FROM/JOIN call-edges that are not inside any CTE, which
2851                // is the resolution target for cross-model references
2852                // (S1, Wave 4). file.path is relative to the repo root and
2853                // is what file_stem() needs to derive the model name.
2854                if languages::is_sql_language(&config.language) {
2855                    enrich_sql_file_def(&file.path, source, &mut file.defs);
2856                }
2857            }
2858            if let Some((lang, import_query)) = import_query_for_extension(ext) {
2859                let raw_imports = extract_imports(source, &lang, &import_query);
2860                let file_path = root.join(&file.path);
2861                file.imports = raw_imports
2862                    .into_iter()
2863                    .map(|raw| {
2864                        let resolved_idx =
2865                            resolve_import(&raw, ext, &file_path, &root, &file_index)
2866                                .and_then(|i| u32::try_from(i).ok());
2867                        ImportRef {
2868                            raw_path: raw,
2869                            resolved_idx,
2870                        }
2871                    })
2872                    .collect();
2873            }
2874        });
2875
2876    // Phase 3: parallel per-file call extraction. Mutates each
2877    // FileNode's `defs[*].calls` independently. Aligned with
2878    // raw_sources by index via the zip.
2879    files
2880        .par_iter_mut()
2881        .zip(raw_sources.par_iter())
2882        .for_each(|(file, (_, ext, source))| {
2883            if let Some(call_config) = languages::call_query_for_extension(ext) {
2884                extract_calls(source, &call_config, &mut file.defs);
2885            }
2886        });
2887
2888    // Build the Python class hierarchy (class_name → parent class names) by
2889    // walking every Python source file. The map is used by `resolve_calls`
2890    // Priority 2.5 to dispatch `self.method()` calls through the MRO when
2891    // the method lives on a parent / mixin class (Q1, Wave 2).
2892    //
2893    // Parallel: extract_python_class_hierarchy is pure per-file, then we
2894    // fold the per-file maps into one global map sequentially because
2895    // HashMap is not lock-free. On a 1k-Python-file corpus this fold takes
2896    // <10ms — much smaller than the parallel parse work that feeds it.
2897    let python_hierarchies: Vec<HashMap<String, Vec<String>>> = raw_sources
2898        .par_iter()
2899        .map(|(_, ext, source)| {
2900            if ext == "py" || ext == "pyi" {
2901                extract_python_class_hierarchy(source)
2902            } else {
2903                HashMap::new()
2904            }
2905        })
2906        .collect();
2907    let mut python_class_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
2908    for local in python_hierarchies {
2909        for (k, v) in local {
2910            // First declaration wins on name collisions across files. The
2911            // MRO walk only needs a plausible parent chain to find an
2912            // ancestor's methods; this is conservative but acceptable.
2913            python_class_hierarchy.entry(k).or_insert(v);
2914        }
2915    }
2916
2917    // Resolve call references to target definitions
2918    let def_index = build_def_index(&files);
2919    resolve_calls(&mut files, &def_index, &python_class_hierarchy);
2920
2921    // Build def-level graph, compute PageRank, and derive file-level data
2922    let graph_data = compute_def_graph(&files);
2923
2924    // Build file-level caller/callee lists
2925    let n = files.len();
2926    let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
2927
2928    // Auto-tune alpha based on graph density
2929    #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
2930    let density = if n > 1 {
2931        graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
2932    } else {
2933        0.0
2934    };
2935    let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
2936
2937    Ok(RepoGraph {
2938        files,
2939        edges: graph_data.file_edges,
2940        base_ranks: graph_data.base_ranks,
2941        callers,
2942        callees,
2943        def_edges: graph_data.def_edges,
2944        def_ranks: graph_data.def_ranks,
2945        def_callers: graph_data.def_callers,
2946        def_callees: graph_data.def_callees,
2947        def_offsets: graph_data.offsets,
2948        alpha,
2949    })
2950}
2951
2952/// Build a `RepoGraph` directly from a pre-constructed `Vec<FileNode>`.
2953///
2954/// Skips the filesystem walk phase of [`build_graph`]; useful for integration
2955/// tests that want to build synthetic graphs without touching disk.
2956///
2957/// Resolves calls, builds the def-level graph (including G3 trait↔impl edges),
2958/// computes `PageRank`, and builds caller/callee lists.
2959#[must_use]
2960pub fn build_graph_from_files_pub(mut files: Vec<FileNode>) -> RepoGraph {
2961    let def_index = build_def_index(&files);
2962    // No source is available at this entry point, so the Python class
2963    // hierarchy is empty and the MRO walk (Priority 2.5) is a no-op.
2964    // Tests that need MRO resolution should drive `resolve_calls_with_python_mro_pub`
2965    // directly and then call `build_graph_from_files_pub` for the rest of the pipeline.
2966    let empty_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
2967    resolve_calls(&mut files, &def_index, &empty_hierarchy);
2968    let graph_data = compute_def_graph(&files);
2969    let n = files.len();
2970    let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
2971
2972    #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
2973    let density = if n > 1 {
2974        graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
2975    } else {
2976        0.0
2977    };
2978    let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
2979
2980    RepoGraph {
2981        files,
2982        edges: graph_data.file_edges,
2983        base_ranks: graph_data.base_ranks,
2984        callers,
2985        callees,
2986        def_edges: graph_data.def_edges,
2987        def_ranks: graph_data.def_ranks,
2988        def_callers: graph_data.def_callers,
2989        def_callees: graph_data.def_callees,
2990        def_offsets: graph_data.offsets,
2991        alpha,
2992    }
2993}
2994
2995// ── Dead-code analysis ───────────────────────────────────────────────────────
2996
2997/// Global flat definition index: `def_offsets[file_idx] + def_idx_within_file`.
2998///
2999/// This is the natural index into [`RepoGraph::def_ranks`],
3000/// [`RepoGraph::def_callers`], and [`RepoGraph::def_callees`].
3001pub type DefIndex = usize;
3002
3003/// A connected component in the dead-code subgraph.
3004///
3005/// All members are definitions that are unreachable from any entry point.
3006/// The component is formed by treating `def_callees + def_callers` edges as
3007/// undirected — so a transitively-dead group (including mutual-recursion
3008/// cycles that are collectively unreachable) surfaces as one cluster.
3009#[derive(Debug, Clone)]
3010pub struct DeadCluster {
3011    /// Global flat def index of the cluster root (the member with the
3012    /// highest [`RepoGraph::def_ranks`] score).
3013    pub root_def_idx: usize,
3014    /// Number of definitions in this cluster.
3015    pub size: usize,
3016    /// Sum of `end_line - start_line` for every member definition.
3017    pub total_lines: usize,
3018    /// All member global def indices, root first.
3019    pub member_def_indices: Vec<usize>,
3020}
3021
3022/// Summary report from [`compute_dead_code`].
3023///
3024/// The primary consumer is X3's `mcp__ripvec__find_dead_code` MCP tool.
3025#[derive(Debug, Clone)]
3026pub struct DeadCodeReport {
3027    /// Dead clusters sorted by size descending (largest first).
3028    pub dead_clusters: Vec<DeadCluster>,
3029    /// Total number of definitions unreachable from any entry point.
3030    pub total_dead_defs: usize,
3031    /// Total number of definitions reachable from at least one entry point.
3032    pub total_live_defs: usize,
3033    /// Fraction of all definitions that are dead: `dead / (dead + live)`.
3034    pub dead_fraction: f32,
3035}
3036
3037/// Returns true if the given path is a test-related path.
3038///
3039/// Heuristic: the path contains `tests/`, `/spec/`, `/specs/`, or the
3040/// file stem starts/ends with `test_`/`_test` or contains `bench`.
3041fn is_test_path(path: &str) -> bool {
3042    let path_lc = path.to_lowercase();
3043    if path_lc.contains("tests/") || path_lc.contains("/spec/") || path_lc.contains("/specs/") {
3044        return true;
3045    }
3046    let file_name = path.rsplit('/').next().unwrap_or(path);
3047    let stem = file_name.split('.').next().unwrap_or(file_name);
3048    stem.starts_with("test_") || stem.ends_with("_test") || stem.contains("bench")
3049}
3050
3051/// Resolve a flat [`DefIndex`] to its file index using the `def_offsets`
3052/// prefix-sum table.
3053fn flat_to_file_idx(offsets: &[usize], flat: DefIndex) -> usize {
3054    offsets.partition_point(|&o| o <= flat).saturating_sub(1)
3055}
3056
3057/// Union-find: path-compressing find. Returns the root representative of `x`.
3058fn uf_find(parent: &mut Vec<usize>, x: usize) -> usize {
3059    if parent[x] != x {
3060        parent[x] = uf_find(parent, parent[x]);
3061    }
3062    parent[x]
3063}
3064
3065/// Union-find: merge the components containing `x` and `y`.
3066fn uf_union(parent: &mut Vec<usize>, x: usize, y: usize) {
3067    let rx = uf_find(parent, x);
3068    let ry = uf_find(parent, y);
3069    if rx != ry {
3070        parent[rx] = ry;
3071    }
3072}
3073
3074/// Compute the set of definitions unreachable from any entry point.
3075///
3076/// Returns dead clusters (connected components in the unreachable subgraph),
3077/// sorted by size descending.
3078///
3079/// # Algorithm
3080///
3081/// 1. Optionally filter test paths from `entry_def_indices` (when
3082///    `include_test_paths` is `false`). Test-path heuristic: the file path
3083///    contains `test_`, `_test`, `tests/`, `spec/`, `specs/`, or `bench`.
3084/// 2. BFS forward over [`RepoGraph::def_callees`] from the entry seeds ->
3085///    reachable set.
3086/// 3. Complement `(all_defs - reachable)` = dead set.
3087/// 4. Connected-components on the dead subgraph via
3088///    `def_callees + def_callers` treated as undirected (union-find).
3089/// 5. For each cluster: pick the highest-rank def as the cluster root;
3090///    `size` = member count; `total_lines` = sum of `(end_line -
3091///    start_line)` per member.
3092/// 6. Sort clusters by size descending.
3093#[must_use]
3094#[expect(
3095    clippy::too_many_lines,
3096    reason = "six-step BFS+clustering pipeline; splitting into sub-functions \
3097              would require passing many interdependent Vec borrows with no \
3098              clarity gain"
3099)]
3100pub fn compute_dead_code<S: std::hash::BuildHasher>(
3101    graph: &RepoGraph,
3102    entry_def_indices: &HashSet<DefIndex, S>,
3103    include_test_paths: bool,
3104) -> DeadCodeReport {
3105    let n_defs = graph.def_ranks.len();
3106    if n_defs == 0 {
3107        return DeadCodeReport {
3108            dead_clusters: vec![],
3109            total_dead_defs: 0,
3110            total_live_defs: 0,
3111            dead_fraction: 0.0,
3112        };
3113    }
3114
3115    // Step 1: build the effective seed set.
3116    let seeds: Vec<DefIndex> = if include_test_paths {
3117        entry_def_indices.iter().copied().collect()
3118    } else {
3119        entry_def_indices
3120            .iter()
3121            .copied()
3122            .filter(|&flat| {
3123                let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
3124                let path = graph
3125                    .files
3126                    .get(file_idx)
3127                    .map(|f| f.path.as_str())
3128                    .unwrap_or("");
3129                !is_test_path(path)
3130            })
3131            .collect()
3132    };
3133
3134    // Step 2: BFS forward over def_callees from seeds -> reachable set.
3135    let mut reachable: Vec<bool> = vec![false; n_defs];
3136    let mut queue: std::collections::VecDeque<DefIndex> = std::collections::VecDeque::new();
3137
3138    for seed in &seeds {
3139        if *seed < n_defs && !reachable[*seed] {
3140            reachable[*seed] = true;
3141            queue.push_back(*seed);
3142        }
3143    }
3144
3145    while let Some(flat) = queue.pop_front() {
3146        for callee_did in &graph.def_callees[flat] {
3147            let callee_flat = graph.def_offsets[callee_did.0 as usize] + callee_did.1 as usize;
3148            if callee_flat < n_defs && !reachable[callee_flat] {
3149                reachable[callee_flat] = true;
3150                queue.push_back(callee_flat);
3151            }
3152        }
3153    }
3154
3155    // Step 3: dead set = all_defs - reachable.
3156    let dead: Vec<DefIndex> = (0..n_defs).filter(|&i| !reachable[i]).collect();
3157    let total_dead_defs = dead.len();
3158    let total_live_defs = n_defs - total_dead_defs;
3159
3160    if dead.is_empty() {
3161        return DeadCodeReport {
3162            dead_clusters: vec![],
3163            total_dead_defs: 0,
3164            total_live_defs,
3165            dead_fraction: 0.0,
3166        };
3167    }
3168
3169    // Step 4: connected components on the dead subgraph via union-find.
3170    let dead_set: HashSet<DefIndex> = dead.iter().copied().collect();
3171    let dead_pos: HashMap<DefIndex, usize> = dead
3172        .iter()
3173        .copied()
3174        .enumerate()
3175        .map(|(pos, idx)| (idx, pos))
3176        .collect();
3177    let m = dead.len();
3178    let mut parent: Vec<usize> = (0..m).collect();
3179
3180    for &flat in &dead {
3181        let pos_flat = dead_pos[&flat];
3182        for callee_did in &graph.def_callees[flat] {
3183            let callee_flat = graph.def_offsets[callee_did.0 as usize] + callee_did.1 as usize;
3184            if dead_set.contains(&callee_flat) {
3185                let pos_callee = dead_pos[&callee_flat];
3186                uf_union(&mut parent, pos_flat, pos_callee);
3187            }
3188        }
3189        for caller_did in &graph.def_callers[flat] {
3190            let caller_flat = graph.def_offsets[caller_did.0 as usize] + caller_did.1 as usize;
3191            if dead_set.contains(&caller_flat) {
3192                let pos_caller = dead_pos[&caller_flat];
3193                uf_union(&mut parent, pos_flat, pos_caller);
3194            }
3195        }
3196    }
3197
3198    // Flatten roots (path compression for all).
3199    for i in 0..m {
3200        uf_find(&mut parent, i);
3201    }
3202
3203    // Group members by their component root.
3204    let mut components: HashMap<usize, Vec<DefIndex>> = HashMap::new();
3205    for (pos, &flat) in dead.iter().enumerate() {
3206        let root_pos = parent[pos];
3207        components.entry(root_pos).or_default().push(flat);
3208    }
3209
3210    // Step 5: build clusters - root is the highest-rank member.
3211    let mut clusters: Vec<DeadCluster> = components
3212        .into_values()
3213        .map(|members| {
3214            let root_flat = members
3215                .iter()
3216                .copied()
3217                .max_by(|&a, &b| {
3218                    let ra = graph.def_ranks.get(a).copied().unwrap_or(0.0);
3219                    let rb = graph.def_ranks.get(b).copied().unwrap_or(0.0);
3220                    ra.total_cmp(&rb)
3221                })
3222                .unwrap_or(members[0]);
3223
3224            let total_lines: usize = members
3225                .iter()
3226                .copied()
3227                .map(|flat| {
3228                    let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
3229                    let def_idx = flat - graph.def_offsets[file_idx];
3230                    let def = graph.files.get(file_idx).and_then(|f| f.defs.get(def_idx));
3231                    def.map(|d| (d.end_line as usize).saturating_sub(d.start_line as usize))
3232                        .unwrap_or(0)
3233                })
3234                .sum();
3235
3236            let mut member_def_indices: Vec<usize> = std::iter::once(root_flat)
3237                .chain(members.iter().copied().filter(|&m| m != root_flat))
3238                .collect();
3239            if member_def_indices.len() > 1 {
3240                member_def_indices[1..].sort_unstable();
3241            }
3242
3243            DeadCluster {
3244                root_def_idx: root_flat,
3245                size: member_def_indices.len(),
3246                total_lines,
3247                member_def_indices,
3248            }
3249        })
3250        .collect();
3251
3252    // Step 6: sort by size descending.
3253    clusters.sort_by(|a, b| {
3254        b.size
3255            .cmp(&a.size)
3256            .then(b.root_def_idx.cmp(&a.root_def_idx))
3257    });
3258
3259    #[expect(
3260        clippy::cast_precision_loss,
3261        reason = "def counts fit comfortably in f32"
3262    )]
3263    let dead_fraction = if n_defs > 0 {
3264        total_dead_defs as f32 / n_defs as f32
3265    } else {
3266        0.0
3267    };
3268
3269    DeadCodeReport {
3270        dead_clusters: clusters,
3271        total_dead_defs,
3272        total_live_defs,
3273        dead_fraction,
3274    }
3275}
3276
3277impl RepoGraph {
3278    /// Get the `PageRank` score for a specific definition.
3279    #[must_use]
3280    pub fn def_rank(&self, did: DefId) -> f32 {
3281        let flat = self.def_offsets[did.0 as usize] + did.1 as usize;
3282        self.def_ranks.get(flat).copied().unwrap_or(0.0)
3283    }
3284
3285    /// Look up a definition by file path and name. Returns the first match.
3286    #[must_use]
3287    pub fn find_def(&self, file_path: &str, def_name: &str) -> Option<DefId> {
3288        for (file_idx, file) in self.files.iter().enumerate() {
3289            if file.path == file_path {
3290                for (def_idx, def) in file.defs.iter().enumerate() {
3291                    if def.name == def_name {
3292                        #[expect(clippy::cast_possible_truncation)]
3293                        return Some((file_idx as u32, def_idx as u16));
3294                    }
3295                }
3296            }
3297        }
3298        None
3299    }
3300
3301    /// Resolve a caller-supplied `focus_file` string to a file index in [`Self::files`].
3302    ///
3303    /// Accepts any of the path forms that ripvec itself emits or accepts:
3304    ///
3305    /// - **Exact stored path** (`device_opt/services/storage.py`) — direct match.
3306    /// - **LSP-shaped path** (`./device_opt/services/storage.py`) — the `./`
3307    ///   prefix used by every [`RepoMapLspLocation::file_path`] is stripped
3308    ///   before comparison so the documented chaining pattern
3309    ///   `get_repo_map(focus_file=hits[0].lsp_location.file_path)` works.
3310    /// - **Strict suffix** (`storage.py`, `services/storage.py`) — match when
3311    ///   the previous character in the stored path is `/`. Avoids matching
3312    ///   `foo_storage.py` for `storage.py`.
3313    ///
3314    /// Returns [`FocusResolution::Found`] when exactly one file matches,
3315    /// [`FocusResolution::Ambiguous`] when multiple files match (the caller
3316    /// surfaces the candidate list to the user), and [`FocusResolution::NotFound`]
3317    /// when no file matches.
3318    ///
3319    /// # Background
3320    ///
3321    /// Prior to this helper the MCP layer (`crates/ripvec-mcp/src/tools.rs`)
3322    /// did the matching inline with two bugs:
3323    ///
3324    /// 1. **`./` prefix mismatch.** [`RepoMapLspLocation::file_path`] always
3325    ///    carries a leading `./` (see [`file_lsp_location`]), but
3326    ///    [`FileNode::path`] does not. Passing the LSP location verbatim as
3327    ///    `focus_file` matched zero files. The matcher silently returned
3328    ///    `focus = None`, producing rank values bit-identical to the unfocused
3329    ///    call — the bug originally reported as "I#20 focus_file rebias
3330    ///    invisible on Python".
3331    /// 2. **Equal-length false negative.** When the user passed
3332    ///    `./device_opt/services/storage.py` and the stored path was
3333    ///    `device_opt/services/storage.py`, `exact` was false (the strings
3334    ///    differ by two bytes) and `strict_suffix` was false (the focus is
3335    ///    longer than the stored path, so `p.len() > focus.len()` fails). The
3336    ///    pathology surfaced specifically when the focus was a *full* path
3337    ///    with the LSP `./` prefix.
3338    ///
3339    /// Centralising the resolution here gives every caller the same
3340    /// normalization-tolerant semantics and one place to test the contract.
3341    #[must_use]
3342    pub fn resolve_focus_file(&self, focus: &str) -> FocusResolution {
3343        let normalized = normalize_focus_path(focus);
3344        let matches: Vec<usize> = self
3345            .files
3346            .iter()
3347            .enumerate()
3348            .filter_map(|(idx, f)| {
3349                if focus_matches_path(&f.path, normalized) {
3350                    Some(idx)
3351                } else {
3352                    None
3353                }
3354            })
3355            .collect();
3356        match matches.len() {
3357            0 => FocusResolution::NotFound,
3358            1 => FocusResolution::Found(matches[0]),
3359            _ => FocusResolution::Ambiguous(
3360                matches
3361                    .into_iter()
3362                    .map(|i| self.files[i].path.clone())
3363                    .collect(),
3364            ),
3365        }
3366    }
3367}
3368
3369/// Result of resolving a user-supplied `focus_file` string against a [`RepoGraph`].
3370///
3371/// See [`RepoGraph::resolve_focus_file`] for the resolution semantics and the
3372/// historical bug that motivated the helper.
3373#[derive(Debug, Clone)]
3374pub enum FocusResolution {
3375    /// Exactly one file matched. Carries the file index in [`RepoGraph::files`].
3376    Found(usize),
3377    /// No file matched. The caller treats this as an unfocused call.
3378    NotFound,
3379    /// Two or more files matched. The caller surfaces the candidate list so
3380    /// the user can disambiguate by passing a longer suffix or the full path.
3381    Ambiguous(Vec<String>),
3382}
3383
3384/// Strip the leading `./` prefix from a focus_file path.
3385///
3386/// The `./` form is produced by [`file_lsp_location`] for every
3387/// [`RepoMapLspLocation::file_path`] field on a relative path. Stripping it
3388/// gives a stored-path-shaped value for the suffix matcher to compare
3389/// against [`FileNode::path`] entries (which do not carry the prefix).
3390///
3391/// Absolute paths (`/abs/path/file.py`) are returned unchanged; they will
3392/// fail the suffix match against the relative stored paths, which is the
3393/// correct behavior (the caller meant a different root entirely).
3394fn normalize_focus_path(focus: &str) -> &str {
3395    focus.strip_prefix("./").unwrap_or(focus)
3396}
3397
3398/// Return true when `focus` matches `stored_path` as either an exact path or
3399/// a strict-suffix (must be preceded by `/`). The empty focus does not match.
3400fn focus_matches_path(stored_path: &str, focus: &str) -> bool {
3401    if focus.is_empty() {
3402        return false;
3403    }
3404    if stored_path == focus {
3405        return true;
3406    }
3407    stored_path.len() > focus.len()
3408        && stored_path.ends_with(focus)
3409        && stored_path.as_bytes()[stored_path.len() - focus.len() - 1] == b'/'
3410}
3411
3412/// Build top-N caller and callee lists for each file.
3413///
3414/// Given a list of weighted directed edges `(src, dst, weight)` over `n`
3415/// nodes, returns `(callers[i], callees[i])` for each node `i`, where each
3416/// list contains the top-[`MAX_NEIGHBORS`] adjacent nodes sorted by descending
3417/// edge weight.
3418///
3419/// Exposed as `pub` so that integration tests can construct synthetic
3420/// [`RepoGraph`] instances for unit-testing the JSON rendering without going
3421/// through a full disk walk.
3422#[must_use]
3423pub fn build_neighbor_lists(n: usize, edges: &[(u32, u32, u32)]) -> (Vec<Vec<u32>>, Vec<Vec<u32>>) {
3424    let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3425    let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3426
3427    for &(src, dst, w) in edges {
3428        let (s, d) = (src as usize, dst as usize);
3429        if s < n && d < n {
3430            incoming[d].push((src, w));
3431            outgoing[s].push((dst, w));
3432        }
3433    }
3434
3435    // Sort by weight descending, keep top N
3436    let trim = |lists: &mut [Vec<(u32, u32)>]| -> Vec<Vec<u32>> {
3437        lists
3438            .iter_mut()
3439            .map(|list| {
3440                list.sort_by_key(|b| std::cmp::Reverse(b.1));
3441                list.iter()
3442                    .take(MAX_NEIGHBORS)
3443                    .map(|(idx, _)| *idx)
3444                    .collect()
3445            })
3446            .collect()
3447    };
3448
3449    (trim(&mut incoming), trim(&mut outgoing))
3450}
3451
3452// ── Rendering ────────────────────────────────────────────────────────
3453
3454/// Render a budget-constrained overview of the repository.
3455///
3456/// Files are sorted by `PageRank` (or topic-sensitive rank if `focus` is
3457/// `Some`). Output uses four tiers of decreasing detail:
3458///
3459/// - **Tier 0** (top 10%): full path, rank, callers/callees, signatures with scopes
3460/// - **Tier 1** (next 20%): full path, rank, signatures
3461/// - **Tier 2** (next 40%): full path, rank, definition names and kinds
3462/// - **Tier 3** (bottom 30%): file path only
3463///
3464/// Stops accumulating output when the estimated token count exceeds
3465/// `max_tokens`.
3466#[must_use]
3467pub fn render(graph: &RepoGraph, max_tokens: usize, focus: Option<usize>) -> String {
3468    let n = graph.files.len();
3469    if n == 0 {
3470        return String::new();
3471    }
3472
3473    // Compute ranks (recompute topic-sensitive if focus is given)
3474    let ranks = if focus.is_some() {
3475        pagerank(n, &graph.edges, focus)
3476    } else {
3477        graph.base_ranks.clone()
3478    };
3479
3480    // Sort file indices by rank descending
3481    let mut sorted: Vec<usize> = (0..n).collect();
3482    sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
3483
3484    let mut output = String::new();
3485    let mut used_tokens = 0;
3486    let max_chars = max_tokens * CHARS_PER_TOKEN;
3487
3488    for (rank_pos, &file_idx) in sorted.iter().enumerate() {
3489        if used_tokens >= max_tokens {
3490            break;
3491        }
3492
3493        let file = &graph.files[file_idx];
3494        let score = ranks[file_idx];
3495        #[expect(clippy::cast_precision_loss, reason = "file counts fit in f32")]
3496        let percentile = (rank_pos as f32) / (n as f32);
3497
3498        let section = if percentile < 0.1 {
3499            render_tier0(graph, file_idx, file, score)
3500        } else if percentile < 0.3 {
3501            render_tier1(file, score)
3502        } else if percentile < 0.7 {
3503            render_tier2(file, score)
3504        } else {
3505            render_tier3(file)
3506        };
3507
3508        let section_chars = section.len();
3509        if used_tokens > 0 && used_tokens + section_chars / CHARS_PER_TOKEN > max_tokens {
3510            // Would exceed budget — try to fit at least the path
3511            let path_line = format!("{}\n", file.path);
3512            let path_tokens = path_line.len() / CHARS_PER_TOKEN;
3513            if used_tokens + path_tokens <= max_tokens {
3514                output.push_str(&path_line);
3515            }
3516            break;
3517        }
3518
3519        output.push_str(&section);
3520        used_tokens = output.len().min(max_chars) / CHARS_PER_TOKEN;
3521    }
3522
3523    output
3524}
3525
3526/// Render tier 0: full detail with callers, callees, and signatures.
3527fn render_tier0(graph: &RepoGraph, file_idx: usize, file: &FileNode, score: f32) -> String {
3528    let mut out = format!("## {} (rank: {score:.4})\n", file.path);
3529
3530    // Callers
3531    if file_idx < graph.callers.len() && !graph.callers[file_idx].is_empty() {
3532        let _ = write!(out, "  called by: ");
3533        let names: Vec<&str> = graph.callers[file_idx]
3534            .iter()
3535            .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
3536            .collect();
3537        let _ = writeln!(out, "{}", names.join(", "));
3538    }
3539
3540    // Callees
3541    if file_idx < graph.callees.len() && !graph.callees[file_idx].is_empty() {
3542        let _ = write!(out, "  calls: ");
3543        let names: Vec<&str> = graph.callees[file_idx]
3544            .iter()
3545            .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
3546            .collect();
3547        let _ = writeln!(out, "{}", names.join(", "));
3548    }
3549
3550    // Definitions with scope and signature
3551    for def in &file.defs {
3552        let scope_prefix = if def.scope.is_empty() {
3553            String::new()
3554        } else {
3555            format!("{} > ", def.scope)
3556        };
3557        if let Some(sig) = &def.signature {
3558            let _ = writeln!(out, "  {scope_prefix}{} {sig}", def.kind);
3559        } else {
3560            let _ = writeln!(out, "  {scope_prefix}{} {}", def.kind, def.name);
3561        }
3562    }
3563    let _ = writeln!(out);
3564    out
3565}
3566
3567/// Render tier 1: file path, rank, and signatures.
3568fn render_tier1(file: &FileNode, score: f32) -> String {
3569    let mut out = format!("## {} (rank: {score:.4})\n", file.path);
3570    for def in &file.defs {
3571        if let Some(sig) = &def.signature {
3572            let _ = writeln!(out, "  {sig}");
3573        } else {
3574            let _ = writeln!(out, "  {} {}", def.kind, def.name);
3575        }
3576    }
3577    let _ = writeln!(out);
3578    out
3579}
3580
3581/// Render tier 2: file path, rank, and definition names/kinds.
3582fn render_tier2(file: &FileNode, score: f32) -> String {
3583    let mut out = format!("{} (rank: {score:.4})", file.path);
3584    if !file.defs.is_empty() {
3585        let names: Vec<String> = file
3586            .defs
3587            .iter()
3588            .map(|d| format!("{}:{}", d.kind, d.name))
3589            .collect();
3590        let _ = write!(out, " -- {}", names.join(", "));
3591    }
3592    let _ = writeln!(out);
3593    out
3594}
3595
3596/// Render tier 3: file path only.
3597fn render_tier3(file: &FileNode) -> String {
3598    format!("{}\n", file.path)
3599}
3600
3601// ── JSON rendering ───────────────────────────────────────────────────
3602
3603/// Build the `lsp_location` for a file itself (line 0).
3604fn file_lsp_location(path: &str) -> RepoMapLspLocation {
3605    RepoMapLspLocation {
3606        file_path: if path.starts_with("./") || path.starts_with('/') {
3607            path.to_string()
3608        } else {
3609            format!("./{path}")
3610        },
3611        start_line: 0,
3612        start_character: 0,
3613        end_line: 0,
3614        end_character: 0,
3615    }
3616}
3617
3618/// Infer `ContentKind` from a file path's extension.
3619fn content_kind_for_path(path: &str) -> ContentKind {
3620    let ext = std::path::Path::new(path)
3621        .extension()
3622        .and_then(|e| e.to_str())
3623        .unwrap_or("");
3624    ContentKind::from_extension(ext)
3625}
3626
3627/// Minimum byte envelope reserved for each included file.
3628///
3629/// Even a file with zero symbols takes JSON overhead for path, rank, arrays,
3630/// etc. Calibrated against actual serde_json output for an empty `RepoMapFile`:
3631/// `{"lsp_location":{"file_path":"./src/file_N.rs","start_line":0,"start_character":0,`
3632/// `"end_line":0,"end_character":0},"rank":0.1234,"content_kind":"code",`
3633/// `"calls":[],"symbols":[],"truncated_symbols":0,"truncated_calls":0}` ≈ 250 bytes.
3634///
3635/// This floor prevents the budget allocator from giving a file so little space
3636/// that it can emit no envelope at all.
3637const FILE_ENVELOPE_MIN_BYTES: usize = 250;
3638
3639/// Minimum useful payload for an admitted file: envelope plus room for at
3640/// least 2-3 typical-sized symbols. Files whose fair share cannot meet this
3641/// floor are excluded entirely (Fix A, 4.0.2). Without this guard, low-rank
3642/// tail files consume budget on envelopes that contain no symbols or calls,
3643/// crowding out content for the top files.
3644const FILE_MIN_USEFUL_BYTES: usize = 600;
3645
3646/// Fraction of each file's per-file budget reserved for outgoing-call edges
3647/// after the envelope is paid. The remaining (1 - this) fraction goes to
3648/// symbols. Symbol leftover flows into calls; call leftover flows to the
3649/// next file. (Fix C, 4.0.2 — without a reserve, the symbol loop saturates
3650/// the per-file budget and calls always come up empty.)
3651const CALLS_BUDGET_FRACTION: f64 = 0.30;
3652
3653/// Maximum fraction of the total budget that a single file may claim.
3654///
3655/// Without this cap a single very-high-rank file (e.g. `lib.rs`) could
3656/// consume the entire budget, leaving all other files empty.
3657const MAX_FILE_SHARE: f64 = 0.40;
3658
3659/// AST kind priority for orientation-style symbol ordering. Higher = surface
3660/// earlier. Used when def-level PageRank is degenerate (most ranks near zero)
3661/// to fall back on structural signal rather than noise.
3662///
3663/// The intuition: a reader orienting in a codebase wants to see the file's
3664/// *shape* before its *behaviors*. Types declare shape; functions declare
3665/// behavior; fields and constants are internal detail. This ordering matches
3666/// how humans read code top-down. (Fix B, 4.0.2.)
3667fn ast_kind_priority(kind: &str) -> u32 {
3668    match kind {
3669        // Tier 3: shape — what THIS file is
3670        "trait_item" | "interface" | "trait" => 30,
3671        "struct_item" | "struct" | "class_definition" | "class" => 29,
3672        "enum_item" | "enum" => 28,
3673        "type_item" | "type_alias_declaration" | "type_alias" => 27,
3674        "mod_item" | "module" | "namespace" => 26,
3675        // Tier 2: behavior — what THIS file does
3676        "function_item" | "function_definition" | "function" | "method_definition" => 20,
3677        "impl_item" | "impl" => 19,
3678        // Tier 1: declarations
3679        "const_item" | "const_declaration" | "const" => 10,
3680        "static_item" | "static" => 9,
3681        // Tier 0: internals (fields, variables, parameters)
3682        _ => 0,
3683    }
3684}
3685
3686/// Effective AST priority with corpus-relative rank promotion (4.0.4).
3687///
3688/// Preserves the 4.0.2 AST-priority ordering by default (types first,
3689/// then functions, then fields). When a def's PageRank significantly
3690/// exceeds the corpus median, promotes it up one or two tiers so that
3691/// load-bearing defs surface alongside their declared-tier neighbors.
3692///
3693/// Thresholds are corpus-median multiples (self-calibrating):
3694/// - rank > 4× median       → +1 tier (e.g., hot function joins type tier)
3695/// - rank > 16× median      → +2 tiers (extremely hot def)
3696/// - otherwise              → declared tier preserved
3697///
3698/// On degenerate (flat) rank distributions the median equals the floor,
3699/// nothing crosses threshold, and 4.0.2 AST-priority ordering is fully
3700/// preserved. On informative distributions (post-4.0.3 enrichment),
3701/// hot defs surface proportionally.
3702fn effective_priority(kind: &str, def_rank: f32, promo_1: f32, promo_2: f32) -> u32 {
3703    let base = ast_kind_priority(kind);
3704    // Accumulate promotion tiers as a plain integer to satisfy clippy's
3705    // bool_to_int_with_if lint while preserving branch clarity.
3706    let promo_tiers: u32 = u32::from(def_rank > promo_1) + u32::from(def_rank > promo_2);
3707    // Tier spacing matches ast_kind_priority's 10-unit gaps.
3708    base + promo_tiers * 10
3709}
3710
3711/// Estimate the serialised JSON byte cost of one `RepoMapSymbol`.
3712///
3713/// Calibrated against actual serde_json output. A `RepoMapSymbol` serialises to
3714/// approximately:
3715/// `{"name":"<N>","kind":<K>,"lsp_location":{"file_path":"<P>","start_line":0,`
3716/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
3717///
3718/// That is ~165 bytes of overhead (braces, keys, fixed-width integers, rank)
3719/// plus the name length and file_path length. We pass the path length
3720/// separately because the path is the same for all symbols in one file.
3721fn estimate_symbol_bytes(name: &str) -> usize {
3722    // 165 bytes overhead + name length.
3723    // The file_path is not included here because it is part of the
3724    // envelope cost accounted separately.
3725    165 + name.len()
3726}
3727
3728/// Estimate the serialised JSON byte cost of one `RepoMapCall`.
3729///
3730/// Each call entry: `{"lsp_location":{"file_path":"<P>","start_line":0,`
3731/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
3732/// ≈ 120 bytes overhead + path length.
3733fn estimate_call_bytes(target_path: &str) -> usize {
3734    120 + target_path.len()
3735}
3736
3737/// Render a `PageRank`-weighted JSON map with token-budget allocation (4.0.1).
3738///
3739/// # Algorithm
3740///
3741/// **Step 1 — File-share allocation.** Each eligible file receives a byte
3742/// budget proportional to its `base_rank`. The share is capped at 40% of
3743/// `budget_total_bytes` and floored at [`FILE_ENVELOPE_MIN_BYTES`] (200 B).
3744/// Files are included in rank order until the cumulative allocation would
3745/// exceed the total budget.
3746///
3747/// **Step 2 — Per-file symbol fill.** For each included file, symbols are
3748/// walked in def-rank descending order. Inclusion continues until either (a)
3749/// the file's budget share is exhausted (with carry-over of leftover bytes to
3750/// the next file) or (b) a logarithmic attenuation cutoff fires: symbol at
3751/// position `i` (0-based) is included only if its rank ≥ `top_rank /
3752/// (1 + ln(i + 1))`. The same algorithm fills `calls[]` in target-file
3753/// base-rank order. `truncated_symbols` and `truncated_calls` track the
3754/// count of omitted entries.
3755///
3756/// **Step 3 — Response telemetry.** The response includes `estimated_bytes`
3757/// (actual returned content size), `budget_bytes` (`token_budget * 4`),
3758/// and `budget_exhausted` (`total_files > files.len()`).
3759///
3760/// # Arguments
3761///
3762/// - `graph` — the built dependency graph.
3763/// - `token_budget` — caller-specified token budget (× 4 = byte budget).
3764/// - `focus` — optional file index for topic-sensitive `PageRank`.
3765/// - `include_metadata` — when `false` (default), Meta-classified files
3766///   are excluded before ranking.
3767#[must_use]
3768#[expect(
3769    clippy::cast_precision_loss,
3770    reason = "rank sums and counts are small f32/f64; precision loss is acceptable"
3771)]
3772#[expect(
3773    clippy::too_many_lines,
3774    reason = "the three-step allocation algorithm (file-share → symbol-fill → calls-fill) \
3775              is sequential and share state; splitting into helpers would require passing \
3776              mutable slices across three boundaries with no clarity gain"
3777)]
3778pub fn render_json_budgeted(
3779    graph: &RepoGraph,
3780    token_budget: usize,
3781    focus: Option<usize>,
3782    include_metadata: bool,
3783) -> GetRepoMapResponse {
3784    let n = graph.files.len();
3785    if n == 0 {
3786        let budget_bytes = token_budget * CHARS_PER_TOKEN;
3787        return GetRepoMapResponse {
3788            files: vec![],
3789            total_files: 0,
3790            estimated_bytes: 0,
3791            budget_bytes,
3792            budget_exhausted: false,
3793            capped: false,
3794        };
3795    }
3796
3797    let budget_total_bytes = token_budget * CHARS_PER_TOKEN;
3798
3799    // Recompute topic-sensitive ranks if focus is given.
3800    let ranks = if focus.is_some() {
3801        pagerank(n, &graph.edges, focus)
3802    } else {
3803        graph.base_ranks.clone()
3804    };
3805
3806    // Sort all file indices by rank descending.
3807    let mut sorted: Vec<usize> = (0..n).collect();
3808    sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
3809
3810    // Apply metadata exclusion filter.
3811    let eligible: Vec<usize> = if include_metadata {
3812        sorted
3813    } else {
3814        sorted
3815            .into_iter()
3816            .filter(|&idx| {
3817                let kind = content_kind_for_path(&graph.files[idx].path);
3818                kind != ContentKind::Meta
3819            })
3820            .collect()
3821    };
3822
3823    let total_files = eligible.len();
3824
3825    // ── Corpus-median def-rank thresholds for tier promotion (4.0.4) ────────
3826    //
3827    // Compute once per call (corpus-wide, not per-file) so the threshold is
3828    // self-calibrating: flat distributions (all ranks equal) set median = floor
3829    // and nothing crosses threshold; informative distributions see proportional
3830    // promotion. Using corpus-wide median ensures a hot function in one file is
3831    // judged against the entire corpus, not just its local file peers.
3832    // Use the 75th percentile of nonzero def-ranks as the corpus reference value
3833    // for tier promotion (rather than the 50th percentile / median). The 75th
3834    // percentile is more robust: on a flat distribution most defs cluster near the
3835    // floor, so the 75th percentile is only marginally above the floor (making the
3836    // 4× threshold very selective). On an informative distribution (post-4.0.3
3837    // call-edge enrichment) the 75th percentile is meaningfully above the floor,
3838    // so the same 4× multiplier captures genuinely hot defs without falsely
3839    // promoting slightly-above-floor helpers.
3840    //
3841    // The 50th percentile (lower median) was rejected because on a 10-def corpus
3842    // with max/min ratio 5× the median equals the floor, causing the 4× threshold
3843    // to fire on defs that are only 5× above floor (a low-variance corpus). The
3844    // 75th percentile corrects this without requiring hand-tuned per-corpus magic
3845    // numbers.
3846    let corpus_reference_rank: f32 = {
3847        let mut nonzero: Vec<f32> = graph
3848            .def_ranks
3849            .iter()
3850            .copied()
3851            .filter(|r| *r > 0.0)
3852            .collect();
3853        if nonzero.is_empty() {
3854            0.0
3855        } else {
3856            nonzero.sort_unstable_by(f32::total_cmp);
3857            let n = nonzero.len();
3858            // 75th percentile index: floor(0.75 * (n - 1))
3859            let idx = (3 * (n - 1)) / 4;
3860            nonzero[idx]
3861        }
3862    };
3863    let promo_1_threshold = corpus_reference_rank * 4.0; // +1 tier
3864    let promo_2_threshold = corpus_reference_rank * 16.0; // +2 tiers
3865
3866    // ── Step 1: File-share allocation ────────────────────────────────
3867
3868    // Greedily determine which files fit within the budget, computing each
3869    // file's share as it is added. We must run a two-pass approach:
3870    //   pass A: determine which files are included (cumulative sum check),
3871    //   pass B: fill symbols/calls using final per-file allocations.
3872    //
3873    // The "included" decision is based on the running cumulative sum so that
3874    // the leftover redistribution in step 2 can carry forward correctly.
3875
3876    // Floor-first admission (Fix A, 4.0.2):
3877    //
3878    // Cap admitted file count so each gets at least FILE_MIN_USEFUL_BYTES.
3879    // Below this threshold the response would carry envelopes that contain
3880    // no symbols or calls — pure overhead, no information. Concentrating
3881    // the budget on fewer files with real content is strictly better for
3882    // orientation than dropping envelope sentinels for many files.
3883    let max_admissible = budget_total_bytes / FILE_MIN_USEFUL_BYTES;
3884    let admit_count = eligible.len().min(max_admissible.max(1));
3885
3886    let budget_f64 = budget_total_bytes as f64;
3887
3888    // Pre-compute rank sum across ADMITTED files only (top-N by rank). f64
3889    // to avoid precision loss when summing many small f32 values.
3890    let admitted_rank_sum: f64 = eligible
3891        .iter()
3892        .take(admit_count)
3893        .map(|&idx| f64::from(ranks[idx]))
3894        .sum();
3895    let admitted_rank_sum = if admitted_rank_sum > 0.0 {
3896        admitted_rank_sum
3897    } else {
3898        1.0
3899    };
3900
3901    // Compute per-file budgets. Each admitted file gets at least
3902    // FILE_MIN_USEFUL_BYTES; the proportional-to-rank share is applied on
3903    // top of the floor and capped at MAX_FILE_SHARE.
3904    let mut included_indices: Vec<usize> = Vec::new(); // indices into `eligible`
3905    let mut file_budgets: Vec<usize> = Vec::new();
3906    let mut cumulative_budget: usize = 0;
3907
3908    for (i, &file_idx) in eligible.iter().take(admit_count).enumerate() {
3909        let file_rank = f64::from(ranks[file_idx]);
3910        let raw_share = budget_f64 * file_rank / admitted_rank_sum;
3911        let capped = raw_share.min(budget_f64 * MAX_FILE_SHARE);
3912        // `capped` is non-negative and bounded by budget_f64 (a usize).
3913        #[expect(
3914            clippy::cast_possible_truncation,
3915            clippy::cast_sign_loss,
3916            reason = "capped is non-negative and bounded by budget_total_bytes (a usize)"
3917        )]
3918        let budget_i = (capped as usize).max(FILE_MIN_USEFUL_BYTES);
3919
3920        if cumulative_budget + budget_i > budget_total_bytes && !included_indices.is_empty() {
3921            break;
3922        }
3923        cumulative_budget += budget_i;
3924        included_indices.push(i);
3925        file_budgets.push(budget_i);
3926    }
3927
3928    // ── Step 2: Per-file symbol fill ─────────────────────────────────
3929
3930    let mut result_files: Vec<RepoMapFile> = Vec::with_capacity(included_indices.len());
3931    let mut leftover: usize = 0; // unused bytes carried from previous file
3932
3933    for (slot, &eligible_i) in included_indices.iter().enumerate() {
3934        let file_idx = eligible[eligible_i];
3935        let file = &graph.files[file_idx];
3936        let file_rank = ranks[file_idx];
3937        let file_path_lsp = file_lsp_location(&file.path);
3938
3939        let budget_in = file_budgets[slot] + leftover;
3940
3941        // Reserve a fraction of the post-envelope budget for outgoing calls
3942        // (Fix C, 4.0.2). Without this guard the symbol loop saturates
3943        // `budget_in` and the calls loop always trips its byte-check.
3944        // Symbol leftover flows into calls; call leftover flows to the
3945        // next file via the outer `leftover` variable.
3946        let post_envelope = budget_in.saturating_sub(FILE_ENVELOPE_MIN_BYTES);
3947        #[expect(
3948            clippy::cast_possible_truncation,
3949            clippy::cast_sign_loss,
3950            reason = "post_envelope * fraction is bounded by post_envelope (a usize)"
3951        )]
3952        let calls_reserve = (post_envelope as f64 * CALLS_BUDGET_FRACTION) as usize;
3953        let symbols_budget = FILE_ENVELOPE_MIN_BYTES + post_envelope.saturating_sub(calls_reserve);
3954        let mut used: usize = FILE_ENVELOPE_MIN_BYTES; // envelope cost
3955
3956        // ── Symbols ──────────────────────────────────────────────────
3957        // Retrieve def-level ranks for this file via the offset table.
3958        let def_count = file.defs.len();
3959        let def_offset = if file_idx < graph.def_offsets.len() {
3960            graph.def_offsets[file_idx]
3961        } else {
3962            0
3963        };
3964
3965        // Build (def_idx, rank, kind_priority, start_byte) tuples. We sort
3966        // by a composite key: AST kind priority (descending) — putting types
3967        // before functions before fields — then by def_rank (descending)
3968        // within each tier. This is Fix B (4.0.2): the def_rank distribution
3969        // is often degenerate (most defs share near-zero rank because the
3970        // call-edge extractor doesn't capture every dispatch), so we use
3971        // structural signal as the primary ordering and def_rank as the
3972        // within-tier tiebreaker. When def_rank IS informative, it dominates
3973        // *within* its kind tier and recovers the original behavior; the AST
3974        // signal only shifts ordering *between* tiers.
3975        let mut def_rank_pairs: Vec<(usize, f32, u32, u32)> = (0..def_count)
3976            .map(|di| {
3977                let flat = def_offset + di;
3978                let r = graph.def_ranks.get(flat).copied().unwrap_or(0.0);
3979                // Store the ORIGINAL ast_kind_priority in the tuple (used by the
3980                // per-tier attenuation loop below). The sort comparator uses
3981                // effective_priority (which may be higher due to 4.0.4 promotion)
3982                // to reorder hot defs ahead of cold type-tier defs, while the
3983                // attenuation tier tracker continues to use the original AST tier
3984                // so the existing per-tier cutoff behaviour is preserved.
3985                let kind_prio = ast_kind_priority(&file.defs[di].kind);
3986                let decl_order = file.defs[di].start_byte;
3987                (di, r, kind_prio, decl_order)
3988            })
3989            .collect();
3990        def_rank_pairs.sort_unstable_by(|a, b| {
3991            // Primary: effective priority (4.0.4: AST kind + corpus-rank promotion) descending.
3992            // Hot defs that exceed corpus-median thresholds are promoted above their
3993            // declared tier so they surface before cold type-tier defs.
3994            let eff_a = effective_priority(
3995                &file.defs[a.0].kind,
3996                a.1,
3997                promo_1_threshold,
3998                promo_2_threshold,
3999            );
4000            let eff_b = effective_priority(
4001                &file.defs[b.0].kind,
4002                b.1,
4003                promo_1_threshold,
4004                promo_2_threshold,
4005            );
4006            eff_b
4007                .cmp(&eff_a)
4008                // Secondary: def_rank descending within tier.
4009                .then_with(|| b.1.total_cmp(&a.1))
4010                // Tertiary: earlier declaration order (stable, deterministic).
4011                .then_with(|| a.3.cmp(&b.3))
4012        });
4013
4014        let top_def_rank = def_rank_pairs.first().map(|&(_, r, _, _)| r).unwrap_or(0.0);
4015
4016        let mut symbols: Vec<RepoMapSymbol> = Vec::new();
4017        let mut truncated_symbols: usize = 0;
4018
4019        // Track per-tier position for the attenuation cutoff. When AST kind
4020        // priority changes (we've moved from types to functions, say), reset
4021        // the position so the attenuation curve restarts. Otherwise a
4022        // structurally-equivalent-but-later tier would be unfairly cut.
4023        let mut tier_pos: usize = 0;
4024        let mut current_tier: Option<u32> = None;
4025        let mut tier_top_rank: f32 = top_def_rank;
4026
4027        for (pos, &(di, def_r, kind_prio, _)) in def_rank_pairs.iter().enumerate() {
4028            // Reset attenuation at tier boundaries.
4029            if current_tier != Some(kind_prio) {
4030                current_tier = Some(kind_prio);
4031                tier_pos = 0;
4032                tier_top_rank = def_r;
4033            }
4034
4035            // Logarithmic attenuation cutoff, relative to the tier's top rank.
4036            let cutoff = if tier_top_rank > 0.0 {
4037                tier_top_rank / (1.0 + (tier_pos as f32 + 1.0).ln())
4038            } else {
4039                0.0
4040            };
4041            if def_r < cutoff {
4042                // Attenuation cuts the rest of THIS tier; we don't stop
4043                // entirely because the next tier may still have useful
4044                // content within its own attenuation curve. Skip this def.
4045                truncated_symbols += 1;
4046                tier_pos += 1;
4047                continue;
4048            }
4049
4050            let def = &file.defs[di];
4051            let sym_bytes = estimate_symbol_bytes(&def.name);
4052            // Use the reserved symbols sub-budget (Fix C) so calls aren't
4053            // starved when symbols would otherwise saturate budget_in.
4054            if used + sym_bytes > symbols_budget {
4055                truncated_symbols += def_rank_pairs.len() - pos;
4056                break;
4057            }
4058
4059            let kind = crate::languages::lsp_symbol_kind_for_node_kind(&def.kind);
4060            let line_0 = def.start_line.saturating_sub(1) as usize;
4061            symbols.push(RepoMapSymbol {
4062                name: def.name.clone(),
4063                kind,
4064                lsp_location: RepoMapLspLocation {
4065                    file_path: file_path_lsp.file_path.clone(),
4066                    start_line: line_0,
4067                    start_character: 0,
4068                    end_line: line_0,
4069                    end_character: 0,
4070                },
4071                rank: def_r,
4072            });
4073            used += sym_bytes;
4074            tier_pos += 1;
4075        }
4076
4077        // ── Calls ─────────────────────────────────────────────────────
4078        // Gather outgoing callees sorted by target file base_rank descending.
4079        let callee_indices: Vec<usize> = if file_idx < graph.callees.len() {
4080            let mut callees: Vec<(usize, f32)> = graph.callees[file_idx]
4081                .iter()
4082                .filter_map(|&ci| {
4083                    let ci = ci as usize;
4084                    graph.files.get(ci).map(|_| {
4085                        let r = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
4086                        (ci, r)
4087                    })
4088                })
4089                .collect();
4090            callees.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
4091            callees.into_iter().map(|(ci, _)| ci).collect()
4092        } else {
4093            vec![]
4094        };
4095
4096        let call_total = callee_indices.len();
4097        let top_call_rank = callee_indices
4098            .first()
4099            .and_then(|&ci| graph.base_ranks.get(ci))
4100            .copied()
4101            .unwrap_or(0.0);
4102
4103        let mut calls: Vec<RepoMapCall> = Vec::new();
4104        let mut truncated_calls: usize = 0;
4105
4106        for (pos, &ci) in callee_indices.iter().enumerate() {
4107            let callee_rank = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
4108
4109            // Logarithmic attenuation cutoff on target rank.
4110            let cutoff = if top_call_rank > 0.0 {
4111                top_call_rank / (1.0 + (pos as f32 + 1.0).ln())
4112            } else {
4113                0.0
4114            };
4115            if callee_rank < cutoff {
4116                truncated_calls += call_total - pos;
4117                break;
4118            }
4119
4120            let callee_path = &graph.files[ci].path;
4121            let call_bytes = estimate_call_bytes(callee_path);
4122            if used + call_bytes > budget_in {
4123                truncated_calls += call_total - pos;
4124                break;
4125            }
4126
4127            calls.push(RepoMapCall {
4128                lsp_location: file_lsp_location(callee_path),
4129                rank: callee_rank,
4130            });
4131            used += call_bytes;
4132        }
4133
4134        // Carry unused bytes forward to the next file.
4135        leftover = budget_in.saturating_sub(used);
4136
4137        result_files.push(RepoMapFile {
4138            lsp_location: file_path_lsp,
4139            rank: file_rank,
4140            content_kind: content_kind_tag(content_kind_for_path(&file.path)),
4141            calls,
4142            symbols,
4143            truncated_symbols,
4144            truncated_calls,
4145        });
4146    }
4147
4148    let estimated_bytes = serde_json::to_string(&result_files)
4149        .map(|s| s.len())
4150        .unwrap_or(0);
4151
4152    let budget_exhausted = total_files > result_files.len();
4153
4154    GetRepoMapResponse {
4155        files: result_files,
4156        total_files,
4157        estimated_bytes,
4158        budget_bytes: budget_total_bytes,
4159        budget_exhausted,
4160        capped: budget_exhausted,
4161    }
4162}
4163
4164/// Render a `PageRank`-sorted JSON map of the repository (4.0.0 compatibility shim).
4165///
4166/// This function wraps [`render_json_budgeted`] with a synthetic token budget
4167/// derived from `max_files * 2000` (a generous per-file allowance). It exists
4168/// to keep the existing D1/D2 unit tests compiling without change; the MCP
4169/// layer calls [`render_json_budgeted`] directly in 4.0.1.
4170///
4171/// The `capped` field in the response reflects whether the budget was
4172/// exhausted before all `eligible` files were included, which is equivalent
4173/// to the previous `total_files > max_files` check.
4174///
4175/// When `include_metadata` is `false` (default), files whose extension
4176/// classifies as [`ContentKind::Meta`] are excluded before ranking.
4177#[must_use]
4178pub fn render_json(
4179    graph: &RepoGraph,
4180    max_files: usize,
4181    focus: Option<usize>,
4182    include_metadata: bool,
4183) -> GetRepoMapResponse {
4184    // Synthesise a generous token budget: 2000 tokens per requested file.
4185    // This ensures the existing D1/D2 tests (which pass small max_files values
4186    // like 3, 5, 50) see the same file-count behaviour they expect. The test
4187    // assertions check file counts, not byte sizes, so the exact budget value
4188    // only matters for ensuring enough headroom.
4189    let token_budget = max_files.saturating_mul(2000);
4190    render_json_budgeted(graph, token_budget, focus, include_metadata)
4191}
4192
4193// ── Tests ────────────────────────────────────────────────────────────
4194
4195#[cfg(test)]
4196mod tests {
4197    use super::*;
4198
4199    #[test]
4200    fn test_pagerank_simple() {
4201        // 3-node graph: 0 -> 1 -> 2, 2 -> 0 (cycle)
4202        let edges = vec![(0, 1, 1), (1, 2, 1), (2, 0, 1)];
4203        let ranks = pagerank(3, &edges, None);
4204
4205        // All nodes in a symmetric cycle should have equal rank
4206        assert_eq!(ranks.len(), 3);
4207        let sum: f32 = ranks.iter().sum();
4208        assert!(
4209            (sum - 1.0).abs() < 0.01,
4210            "ranks should sum to ~1.0, got {sum}"
4211        );
4212
4213        // In a perfect cycle, all ranks should be approximately equal
4214        let expected = 1.0 / 3.0;
4215        for (i, &r) in ranks.iter().enumerate() {
4216            assert!(
4217                (r - expected).abs() < 0.05,
4218                "rank[{i}] = {r}, expected ~{expected}"
4219            );
4220        }
4221    }
4222
4223    #[test]
4224    fn test_pagerank_star() {
4225        // Star graph: 0,1,2 all point to 3
4226        let edges = vec![(0, 3, 1), (1, 3, 1), (2, 3, 1)];
4227        let ranks = pagerank(4, &edges, None);
4228
4229        assert_eq!(ranks.len(), 4);
4230        // Node 3 should have the highest rank
4231        let max_idx = ranks
4232            .iter()
4233            .enumerate()
4234            .max_by(|a, b| a.1.total_cmp(b.1))
4235            .unwrap()
4236            .0;
4237        assert_eq!(max_idx, 3, "node 3 should have highest rank");
4238        assert!(
4239            ranks[3] > ranks[0],
4240            "rank[3]={} should be > rank[0]={}",
4241            ranks[3],
4242            ranks[0]
4243        );
4244    }
4245
4246    #[test]
4247    fn test_pagerank_topic_sensitive() {
4248        // 10-node chain: 0 -> 1 -> ... -> 9.
4249        //
4250        // With PERSONALIZATION_ALPHA = 0.15 and n = 10, the uniform share per
4251        // node is 1/10 = 0.10.  The focus node (0) gets 0.15 teleportation
4252        // mass vs 0.10 uniform, so focused rank[0] > uniform rank[0] holds.
4253        //
4254        // The 3-node chain used previously broke when alpha was reduced from
4255        // 0.70 to 0.15 because 0.15 < 1/3 = 0.33 for n=3 — the focus node
4256        // received *less* teleportation than its uniform share, inverting the
4257        // expected direction.  Using n=10 avoids this edge case while still
4258        // testing the personalization effect.
4259        let n = 10_usize;
4260        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4261        let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
4262            .map(|i| (i as u32, (i + 1) as u32, 1_u32))
4263            .collect();
4264        let uniform_ranks = pagerank(n, &edges, None);
4265        let biased_ranks = pagerank(n, &edges, Some(0));
4266
4267        // With focus on node 0, it should get a higher rank than uniform
4268        // because PERSONALIZATION_ALPHA (0.15) > 1/n (0.10) for n=10.
4269        assert!(
4270            biased_ranks[0] > uniform_ranks[0],
4271            "focused rank[0]={} should be > uniform rank[0]={}",
4272            biased_ranks[0],
4273            uniform_ranks[0]
4274        );
4275    }
4276
4277    // ── J1 tests — topic-sensitive PageRank soft personalization ─────────
4278
4279    /// J1 RED: `focus_file` PageRank must not collapse other-file ranks.
4280    ///
4281    /// Baseline (pre-4.0.5) concentrated 70% mass on the focus node, producing
4282    /// a degenerate Dirac delta: focus rank ≈ 0.703, all others ≈ 0.003.
4283    /// This test fails on the baseline and must pass after the fix.
4284    ///
4285    /// Invariant: with `PERSONALIZATION_ALPHA = 0.15`, focus node gets 0.15 of
4286    /// teleportation mass and each of the other (n-1) nodes gets 0.85/(n-1).
4287    /// On a star graph with n=10 nodes, the focus node rank must NOT be more
4288    /// than 40× the average non-focus rank.  The 4.0.5 fix targets roughly
4289    /// 5-10× for a well-connected graph, so 40× is a conservative upper bound
4290    /// that the baseline (≈200×) fails.
4291    #[test]
4292    fn test_focus_file_topic_pagerank_preserves_rank_dispersion() {
4293        // Star graph: nodes 1..9 all point to node 0 (high natural rank).
4294        // Focus on node 1 (low natural rank) to test personalization effect.
4295        let n = 10_usize;
4296        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4297        let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
4298
4299        let ranks_focused = pagerank(n, &edges, Some(1));
4300
4301        let focus_rank = ranks_focused[1];
4302        let sum_non_focus: f32 = ranks_focused
4303            .iter()
4304            .enumerate()
4305            .filter(|&(i, _)| i != 1)
4306            .map(|(_, &r)| r)
4307            .sum();
4308        let n_non_focus = (n - 1) as f32;
4309        let avg_non_focus = sum_non_focus / n_non_focus;
4310
4311        let dispersion_ratio = focus_rank / avg_non_focus;
4312
4313        eprintln!(
4314            "J1 dispersion: focus_rank={focus_rank:.6}, avg_non_focus={avg_non_focus:.6}, \
4315             ratio={dispersion_ratio:.2}× (must be <= 40×)"
4316        );
4317
4318        // With 0.15 personalization alpha the focus node's teleportation
4319        // advantage is modest; 40× is an upper bound the old 0.70 code violates.
4320        assert!(
4321            dispersion_ratio <= 40.0,
4322            "focus rank is {dispersion_ratio:.1}× avg non-focus rank (must be ≤ 40×); \
4323             pre-fix baseline was ~200× due to 70% concentration — I#16"
4324        );
4325
4326        // Ranks must still sum to ~1.
4327        let total: f32 = ranks_focused.iter().sum();
4328        assert!(
4329            (total - 1.0).abs() < 0.01,
4330            "ranks must sum to ≈1.0; got {total}"
4331        );
4332    }
4333
4334    /// J1 RED: focus node must have the highest rank (it still gets the bias),
4335    /// but non-focus nodes must NOT collapse to a flat floor.
4336    ///
4337    /// Concretely: the second-highest-ranked file must be ≥ 10% of the focus
4338    /// file's rank (neighborhood rebiasing, not winner-take-all).
4339    #[test]
4340    fn test_focus_file_topic_pagerank_does_not_collapse_other_files() {
4341        // Linear chain: 0 → 1 → 2 → ... → 9 (directed).
4342        // Focus on node 0.  Without personalization, ranks decrease along the
4343        // chain.  With soft personalization the non-focus nodes stay non-trivial.
4344        let n = 10_usize;
4345        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4346        let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
4347            .map(|i| (i as u32, (i + 1) as u32, 1_u32))
4348            .collect();
4349
4350        let ranks = pagerank(n, &edges, Some(0));
4351
4352        let focus_rank = ranks[0];
4353        // All non-focus ranks must be ≥ 10% of focus rank.
4354        for (i, &r) in ranks.iter().enumerate().skip(1) {
4355            assert!(
4356                r >= focus_rank * 0.10,
4357                "rank[{i}] = {r:.6} is < 10% of focus rank {focus_rank:.6}; \
4358                 non-focus files must not collapse to near-zero (I#16)"
4359            );
4360        }
4361    }
4362
4363    // ── J2 tests — neighborhood count parity ─────────────────────────────
4364
4365    /// J2 RED: `render_json_budgeted` with `focus=Some(i)` must return at
4366    /// least 80% as many files as the unfocused call with the same budget.
4367    ///
4368    /// Baseline collapses the focused run to 1 dominant file + a flat tail
4369    /// that may still appear, but the intent is no budget waste on zero-signal
4370    /// entries.  With soft personalization the focused call should return a
4371    /// similar file count to the unfocused call (within ±20%).
4372    #[test]
4373    fn test_focus_file_returns_neighborhood_not_just_focus() {
4374        // Build a 12-file star graph with meaningful rank variation.
4375        let n = 12_usize;
4376        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4377        let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
4378        let base_ranks = pagerank(n, &edges, None);
4379        let (callers, callees) = build_neighbor_lists(n, &edges);
4380
4381        let file_nodes: Vec<FileNode> = (0..n)
4382            .map(|i| FileNode {
4383                path: format!("src/file_{i}.rs"),
4384                defs: vec![Definition {
4385                    name: format!("func_{i}"),
4386                    kind: "function_item".to_string(),
4387                    start_line: 1,
4388                    end_line: 5,
4389                    scope: String::new(),
4390                    signature: Some(format!("fn func_{i}() -> i32")),
4391                    start_byte: 0,
4392                    end_byte: 100,
4393                    calls: vec![],
4394                }],
4395                imports: vec![],
4396            })
4397            .collect();
4398
4399        let graph = RepoGraph {
4400            files: file_nodes,
4401            edges,
4402            base_ranks,
4403            callers,
4404            callees,
4405            def_edges: vec![],
4406            def_ranks: vec![],
4407            def_callers: vec![],
4408            def_callees: vec![],
4409            def_offsets: vec![0],
4410            alpha: 0.5,
4411        };
4412
4413        let budget = 2000; // generous budget; all 12 files should fit
4414        let unfocused = render_json_budgeted(&graph, budget, None, false);
4415        let focused = render_json_budgeted(&graph, budget, Some(1), false);
4416
4417        let unfocused_n = unfocused.files.len();
4418        let focused_n = focused.files.len();
4419        #[expect(
4420            clippy::cast_possible_truncation,
4421            clippy::cast_sign_loss,
4422            reason = "unfocused_n is a file count (small, positive); f32 multiplication \
4423                      by 0.80 and ceil produce a value in [0, n]; truncation to usize is safe"
4424        )]
4425        let min_expected = (unfocused_n as f32 * 0.80).ceil() as usize;
4426
4427        eprintln!(
4428            "J2 neighborhood: unfocused={unfocused_n} files, focused={focused_n} files \
4429             (need ≥ {min_expected})"
4430        );
4431
4432        assert!(
4433            focused_n >= min_expected,
4434            "focused call returned {focused_n} files; expected ≥ {min_expected} \
4435             (80% of unfocused {unfocused_n}); soft personalization must preserve \
4436             rank dispersion across files (I#16/J2)"
4437        );
4438    }
4439
4440    /// J2 RED: topic delta fingerprinting — focused run must reorder files
4441    /// relative to unfocused run (focus file surfaces near top), but both
4442    /// must contain similar total file counts.
4443    #[test]
4444    fn test_focus_delta_topic_fingerprinting_works() {
4445        // Bidirectional 8-file ring so all nodes are structurally equivalent.
4446        // Without focus all ranks are equal.  With focus on node 3, node 3
4447        // must surface as the highest-ranked file.
4448        let n = 8_usize;
4449        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
4450        let edges: Vec<(u32, u32, u32)> = (0..n)
4451            .flat_map(|i| {
4452                let next = ((i + 1) % n) as u32;
4453                let curr = i as u32;
4454                [(curr, next, 1_u32), (next, curr, 1_u32)]
4455            })
4456            .collect();
4457
4458        let ranks_uniform = pagerank(n, &edges, None);
4459        let ranks_focused = pagerank(n, &edges, Some(3));
4460
4461        // Focus node must have highest rank.
4462        let top_idx = ranks_focused
4463            .iter()
4464            .enumerate()
4465            .max_by(|a, b| a.1.total_cmp(b.1))
4466            .map(|(i, _)| i)
4467            .unwrap();
4468
4469        assert_eq!(
4470            top_idx, 3,
4471            "with focus=Some(3), node 3 must have highest rank; top was {top_idx}"
4472        );
4473
4474        // Uniform baseline: all ranks should be approximately equal.
4475        let uniform_max = ranks_uniform
4476            .iter()
4477            .copied()
4478            .fold(f32::NEG_INFINITY, f32::max);
4479        let uniform_min = ranks_uniform.iter().copied().fold(f32::INFINITY, f32::min);
4480        assert!(
4481            (uniform_max - uniform_min).abs() < 0.01,
4482            "on a ring without focus all ranks should be ≈equal; max={uniform_max:.6} min={uniform_min:.6}"
4483        );
4484
4485        // Focused run must rank the focus node significantly higher than others
4486        // but others must remain non-trivial (≥ 5% of focus).
4487        let focus_rank = ranks_focused[3];
4488        for (i, &r) in ranks_focused.iter().enumerate().filter(|&(i, _)| i != 3) {
4489            assert!(
4490                r >= focus_rank * 0.05,
4491                "rank[{i}]={r:.6} is < 5% of focus rank {focus_rank:.6}; \
4492                 soft personalization must preserve non-focus ranks"
4493            );
4494        }
4495    }
4496
4497    // ── T1 tests — focus_file resolver normalization (I#20) ──────────────
4498
4499    /// Build a tiny synthetic graph whose `FileNode::path` values match the
4500    /// shape `build_graph` produces on disk (no leading `./`, forward slashes).
4501    fn focus_resolver_graph() -> RepoGraph {
4502        let file_nodes: Vec<FileNode> = vec![
4503            FileNode {
4504                path: "device_opt/services/storage.py".to_string(),
4505                defs: vec![],
4506                imports: vec![],
4507            },
4508            FileNode {
4509                path: "device_opt/ui/textual/screens/settings.py".to_string(),
4510                defs: vec![],
4511                imports: vec![],
4512            },
4513            FileNode {
4514                path: "device_opt/services/registry.py".to_string(),
4515                defs: vec![],
4516                imports: vec![],
4517            },
4518            FileNode {
4519                path: "tests/test_storage.py".to_string(),
4520                defs: vec![],
4521                imports: vec![],
4522            },
4523        ];
4524        let n = file_nodes.len();
4525        RepoGraph {
4526            files: file_nodes,
4527            edges: vec![],
4528            base_ranks: vec![1.0 / n as f32; n],
4529            callers: vec![vec![]; n],
4530            callees: vec![vec![]; n],
4531            def_edges: vec![],
4532            def_ranks: vec![],
4533            def_callers: vec![],
4534            def_callees: vec![],
4535            def_offsets: vec![0; n + 1],
4536            alpha: 0.5,
4537        }
4538    }
4539
4540    /// T1: focus_file paths emitted by `lsp_location.file_path` (with the
4541    /// `./` prefix) must resolve to the correct file index.
4542    ///
4543    /// Baseline reproduction (mnemosyne corpus, 4.0.5): passing
4544    /// `focus_file="./device_opt/.../settings.py"` produced rank values
4545    /// bit-identical to the unfocused call because the strict-suffix matcher
4546    /// in `tools.rs` failed both the `exact` and the `strict_suffix` checks
4547    /// when the focus carried the LSP `./` prefix. The matcher silently
4548    /// returned `focus = None`, masking the failure as "topic-sensitive
4549    /// PageRank does nothing on Python".
4550    #[test]
4551    fn test_focus_file_resolver_accepts_lsp_location_path() {
4552        let g = focus_resolver_graph();
4553        // LSP-shaped path with leading `./` — the form documented in
4554        // get_repo_map's instructions.
4555        let res = g.resolve_focus_file("./device_opt/ui/textual/screens/settings.py");
4556        match res {
4557            FocusResolution::Found(idx) => {
4558                assert_eq!(
4559                    g.files[idx].path, "device_opt/ui/textual/screens/settings.py",
4560                    "resolver must accept the ./-prefixed LSP path form (I#20)"
4561                );
4562            }
4563            FocusResolution::NotFound | FocusResolution::Ambiguous(_) => {
4564                panic!(
4565                    "resolver returned {res:?} for ./device_opt/ui/textual/screens/settings.py; \
4566                     the LSP-shaped path form must resolve to exactly one file (I#20)"
4567                );
4568            }
4569        }
4570    }
4571
4572    /// T1: the bare stored path (no `./`) must continue to resolve.
4573    /// Regression guard for the pre-fix matcher's "exact" path.
4574    #[test]
4575    fn test_focus_file_resolver_accepts_bare_stored_path() {
4576        let g = focus_resolver_graph();
4577        let res = g.resolve_focus_file("device_opt/services/storage.py");
4578        match res {
4579            FocusResolution::Found(idx) => {
4580                assert_eq!(g.files[idx].path, "device_opt/services/storage.py");
4581            }
4582            other => panic!("expected Found, got {other:?}"),
4583        }
4584    }
4585
4586    /// T1: strict-suffix match — `storage.py` must match
4587    /// `device_opt/services/storage.py` (prev char is `/`) but ambiguity
4588    /// (two `storage.py` files) must be reported, not silently picked.
4589    #[test]
4590    fn test_focus_file_resolver_strict_suffix_and_ambiguity() {
4591        let g = focus_resolver_graph();
4592        // "storage.py" matches both device_opt/services/storage.py and
4593        // tests/test_storage.py? No — test_storage.py has `_` before `storage.py`
4594        // (not `/`), so the strict-suffix matcher rejects it. Only one match.
4595        let res = g.resolve_focus_file("storage.py");
4596        assert!(
4597            matches!(res, FocusResolution::Found(_)),
4598            "strict-suffix `storage.py` must match exactly one file (the `_` in \
4599             test_storage.py blocks the strict-suffix), got {res:?}"
4600        );
4601        // Add a second services/storage.py-shaped file to force ambiguity.
4602        let mut g2 = g.clone();
4603        g2.files.push(FileNode {
4604            path: "vendored/services/storage.py".to_string(),
4605            defs: vec![],
4606            imports: vec![],
4607        });
4608        g2.base_ranks.push(0.0);
4609        g2.callers.push(vec![]);
4610        g2.callees.push(vec![]);
4611        g2.def_offsets.push(*g2.def_offsets.last().unwrap());
4612        let res = g2.resolve_focus_file("storage.py");
4613        match res {
4614            FocusResolution::Ambiguous(cands) => {
4615                assert_eq!(cands.len(), 2, "expected two candidates, got {cands:?}");
4616            }
4617            other => panic!("expected Ambiguous, got {other:?}"),
4618        }
4619    }
4620
4621    /// T1: a focus that matches no file returns `NotFound`. The caller
4622    /// is responsible for either treating this as unfocused or surfacing
4623    /// an error — the resolver itself does not impose policy.
4624    #[test]
4625    fn test_focus_file_resolver_not_found() {
4626        let g = focus_resolver_graph();
4627        let res = g.resolve_focus_file("./does/not/exist.py");
4628        assert!(
4629            matches!(res, FocusResolution::NotFound),
4630            "expected NotFound, got {res:?}"
4631        );
4632    }
4633
4634    /// T1: empty focus does not match anything (avoids the empty-suffix
4635    /// degenerate that would otherwise match every file).
4636    #[test]
4637    fn test_focus_file_resolver_empty_input_is_not_found() {
4638        let g = focus_resolver_graph();
4639        let res = g.resolve_focus_file("");
4640        assert!(
4641            matches!(res, FocusResolution::NotFound),
4642            "empty focus must not match anything, got {res:?}"
4643        );
4644    }
4645
4646    /// T1: focus_file rank delta must be visible on a Python-shaped
4647    /// synthetic graph.
4648    ///
4649    /// Builds a small Python-style call graph (FileNode + Definition with
4650    /// resolved CallRefs, matching what `extract_calls` produces on a real
4651    /// Python corpus), runs `build_graph_from_files_pub` to get a
4652    /// `RepoGraph`, then calls `render_json_budgeted` with and without
4653    /// focus. Asserts that the focused call changes the rank of at least
4654    /// one non-focus file by ≥ 5% in either direction.
4655    ///
4656    /// On the baseline (pre-T1) this test passes when the caller supplies
4657    /// an int focus_idx directly — the engine's topic-sensitive PageRank is
4658    /// correct. The bug was at the string-to-int resolver layer in
4659    /// `tools.rs`, which silently masked the failure as "the rendering
4660    /// path doesn't propagate focus". This test locks the engine's
4661    /// behavior so a future regression in the rendering path is caught.
4662    #[test]
4663    #[expect(
4664        clippy::too_many_lines,
4665        reason = "synthetic Python-shaped graph (five FileNodes with defs + \
4666                  CallRefs + ImportRefs) plus the two-call assertion sequence \
4667                  is inherently long; splitting into helpers would obscure the \
4668                  one-shot reproduction the test is locking in."
4669    )]
4670    fn test_focus_file_rank_delta_visible_on_python_corpus() {
4671        // Five files, Python-shaped: services/storage.py (a "hub" that two
4672        // UI files call into) plus a tests/ file. The Python tree-sitter
4673        // extractor produces `class_definition` and `function_definition`
4674        // kinds with resolved CallRefs pointing at the hub.
4675        let mut files: Vec<FileNode> = vec![
4676            FileNode {
4677                path: "device_opt/services/storage.py".to_string(),
4678                defs: vec![
4679                    Definition {
4680                        name: "ScanStore".to_string(),
4681                        kind: "class_definition".to_string(),
4682                        start_line: 1,
4683                        end_line: 80,
4684                        scope: String::new(),
4685                        signature: None,
4686                        start_byte: 0,
4687                        end_byte: 2000,
4688                        calls: vec![],
4689                    },
4690                    Definition {
4691                        name: "save_scan".to_string(),
4692                        kind: "function_definition".to_string(),
4693                        start_line: 20,
4694                        end_line: 40,
4695                        scope: "class_definition ScanStore".to_string(),
4696                        signature: Some("def save_scan(self, scan)".to_string()),
4697                        start_byte: 200,
4698                        end_byte: 600,
4699                        calls: vec![],
4700                    },
4701                ],
4702                imports: vec![],
4703            },
4704            FileNode {
4705                path: "device_opt/services/registry.py".to_string(),
4706                defs: vec![Definition {
4707                    name: "register".to_string(),
4708                    kind: "function_definition".to_string(),
4709                    start_line: 1,
4710                    end_line: 30,
4711                    scope: String::new(),
4712                    signature: Some("def register(svc)".to_string()),
4713                    start_byte: 0,
4714                    end_byte: 600,
4715                    calls: vec![CallRef {
4716                        name: "save_scan".to_string(),
4717                        qualified_path: None,
4718                        receiver_type: None,
4719                        byte_offset: 100,
4720                        resolved: None,
4721                    }],
4722                }],
4723                imports: vec![ImportRef {
4724                    raw_path: "from device_opt.services import storage".to_string(),
4725                    resolved_idx: Some(0),
4726                }],
4727            },
4728            FileNode {
4729                path: "device_opt/ui/screens/browse.py".to_string(),
4730                defs: vec![Definition {
4731                    name: "browse_scans".to_string(),
4732                    kind: "function_definition".to_string(),
4733                    start_line: 1,
4734                    end_line: 50,
4735                    scope: String::new(),
4736                    signature: Some("def browse_scans(app)".to_string()),
4737                    start_byte: 0,
4738                    end_byte: 1000,
4739                    calls: vec![CallRef {
4740                        name: "save_scan".to_string(),
4741                        qualified_path: None,
4742                        receiver_type: None,
4743                        byte_offset: 200,
4744                        resolved: None,
4745                    }],
4746                }],
4747                imports: vec![ImportRef {
4748                    raw_path: "from device_opt.services import storage".to_string(),
4749                    resolved_idx: Some(0),
4750                }],
4751            },
4752            FileNode {
4753                path: "device_opt/ui/screens/settings.py".to_string(),
4754                defs: vec![Definition {
4755                    name: "open_settings".to_string(),
4756                    kind: "function_definition".to_string(),
4757                    start_line: 1,
4758                    end_line: 40,
4759                    scope: String::new(),
4760                    signature: Some("def open_settings(app)".to_string()),
4761                    start_byte: 0,
4762                    end_byte: 800,
4763                    calls: vec![CallRef {
4764                        name: "register".to_string(),
4765                        qualified_path: None,
4766                        receiver_type: None,
4767                        byte_offset: 150,
4768                        resolved: None,
4769                    }],
4770                }],
4771                imports: vec![ImportRef {
4772                    raw_path: "from device_opt.services import registry".to_string(),
4773                    resolved_idx: Some(1),
4774                }],
4775            },
4776            FileNode {
4777                path: "tests/test_storage.py".to_string(),
4778                defs: vec![Definition {
4779                    name: "test_save".to_string(),
4780                    kind: "function_definition".to_string(),
4781                    start_line: 1,
4782                    end_line: 20,
4783                    scope: String::new(),
4784                    signature: Some("def test_save()".to_string()),
4785                    start_byte: 0,
4786                    end_byte: 400,
4787                    calls: vec![CallRef {
4788                        name: "save_scan".to_string(),
4789                        qualified_path: None,
4790                        receiver_type: None,
4791                        byte_offset: 50,
4792                        resolved: None,
4793                    }],
4794                }],
4795                imports: vec![ImportRef {
4796                    raw_path: "from device_opt.services import storage".to_string(),
4797                    resolved_idx: Some(0),
4798                }],
4799            },
4800        ];
4801
4802        // Resolve calls so the graph builder has edges to chew on.
4803        let def_index = build_def_index(&files);
4804        resolve_calls(&mut files, &def_index, &HashMap::new());
4805        let graph = build_graph_from_files_pub(files);
4806
4807        // Sanity: the graph must have edges (the calls were resolved).
4808        assert!(
4809            !graph.edges.is_empty(),
4810            "Python-shaped synthetic graph must produce file-level edges; got 0. \
4811             The CallRefs may have failed to resolve."
4812        );
4813
4814        // Resolve the focus file via the new helper.
4815        let focus_idx = match graph.resolve_focus_file("./device_opt/ui/screens/settings.py") {
4816            FocusResolution::Found(i) => i,
4817            other => panic!("resolver must find settings.py via LSP-shaped path, got {other:?}"),
4818        };
4819
4820        let budget = 4000;
4821        let unfocused = render_json_budgeted(&graph, budget, None, false);
4822        let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
4823
4824        // Collect rank-by-path maps for both runs.
4825        let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
4826            .files
4827            .iter()
4828            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
4829            .collect();
4830        let focused_ranks: std::collections::HashMap<String, f32> = focused
4831            .files
4832            .iter()
4833            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
4834            .collect();
4835
4836        eprintln!("T1 Python — unfocused ranks: {unfocused_ranks:#?}");
4837        eprintln!("T1 Python — focused ranks:   {focused_ranks:#?}");
4838
4839        // Find at least one non-focus file whose rank changed by ≥ 5% in
4840        // either direction. The threshold is conservative; the soft 0.15
4841        // personalization alpha redistributes mass enough that on this
4842        // 5-node graph the affected neighbors typically shift by 20%+.
4843        let focus_path = "./device_opt/ui/screens/settings.py";
4844        let mut max_delta_ratio = 0.0_f32;
4845        for (path, &u_rank) in &unfocused_ranks {
4846            if path == focus_path {
4847                continue;
4848            }
4849            if let Some(&f_rank) = focused_ranks.get(path)
4850                && u_rank > 0.0
4851            {
4852                let ratio = (f_rank - u_rank).abs() / u_rank;
4853                if ratio > max_delta_ratio {
4854                    max_delta_ratio = ratio;
4855                }
4856            }
4857        }
4858        assert!(
4859            max_delta_ratio >= 0.05,
4860            "focus_file must rebias non-focus file ranks by ≥ 5%; \
4861             max observed delta ratio = {max_delta_ratio:.3} \
4862             (I#20: focus_file invisible on Python corpora)"
4863        );
4864
4865        // Bit-identity guard: at least one non-focus file's rank must NOT
4866        // equal its unfocused value. This is the pathology from the
4867        // mnemosyne reproduction: every rank value was bit-identical
4868        // across global/focused calls.
4869        let any_changed = unfocused_ranks.iter().any(|(path, &u_rank)| {
4870            path != focus_path
4871                && focused_ranks
4872                    .get(path)
4873                    .is_some_and(|&f_rank| f_rank.to_bits() != u_rank.to_bits())
4874        });
4875        assert!(
4876            any_changed,
4877            "no non-focus file rank changed across focused/unfocused calls — \
4878             bit-identical pathology (I#20). unfocused={unfocused_ranks:#?} \
4879             focused={focused_ranks:#?}"
4880        );
4881    }
4882
4883    /// T1: focus_file rank delta on a Rust-shaped synthetic graph.
4884    ///
4885    /// Regression test: confirms the engine's topic-sensitive PageRank
4886    /// works on Rust shapes (where T1's investigation found it already
4887    /// works, but the resolver fix must not break the existing path).
4888    ///
4889    /// This complements `test_focus_file_returns_neighborhood_not_just_focus`
4890    /// by additionally checking that (a) the resolver accepts a Rust path
4891    /// with the `./` LSP prefix, and (b) at least one non-focus file's
4892    /// rank moves by ≥ 5%.
4893    #[test]
4894    #[expect(
4895        clippy::too_many_lines,
4896        reason = "synthetic Rust-shaped graph with four FileNodes plus the \
4897                  two-call assertion sequence inherently exceeds the 100-line \
4898                  cap; the test mirrors the Python-shaped sibling."
4899    )]
4900    fn test_focus_file_rank_delta_preserved_on_rust_corpus() {
4901        let mut files: Vec<FileNode> = vec![
4902            FileNode {
4903                path: "src/lib.rs".to_string(),
4904                defs: vec![Definition {
4905                    name: "Engine".to_string(),
4906                    kind: "struct_item".to_string(),
4907                    start_line: 1,
4908                    end_line: 30,
4909                    scope: String::new(),
4910                    signature: None,
4911                    start_byte: 0,
4912                    end_byte: 600,
4913                    calls: vec![],
4914                }],
4915                imports: vec![],
4916            },
4917            FileNode {
4918                path: "src/encoder/mod.rs".to_string(),
4919                defs: vec![Definition {
4920                    name: "encode".to_string(),
4921                    kind: "function_item".to_string(),
4922                    start_line: 1,
4923                    end_line: 40,
4924                    scope: String::new(),
4925                    signature: Some("fn encode(input: &str) -> Vec<f32>".to_string()),
4926                    start_byte: 0,
4927                    end_byte: 800,
4928                    calls: vec![],
4929                }],
4930                imports: vec![ImportRef {
4931                    raw_path: "use crate::lib;".to_string(),
4932                    resolved_idx: Some(0),
4933                }],
4934            },
4935            FileNode {
4936                path: "src/search.rs".to_string(),
4937                defs: vec![Definition {
4938                    name: "search".to_string(),
4939                    kind: "function_item".to_string(),
4940                    start_line: 1,
4941                    end_line: 30,
4942                    scope: String::new(),
4943                    signature: Some("fn search(q: &str) -> Hits".to_string()),
4944                    start_byte: 0,
4945                    end_byte: 600,
4946                    calls: vec![CallRef {
4947                        name: "encode".to_string(),
4948                        qualified_path: None,
4949                        receiver_type: None,
4950                        byte_offset: 100,
4951                        resolved: None,
4952                    }],
4953                }],
4954                imports: vec![ImportRef {
4955                    raw_path: "use crate::encoder;".to_string(),
4956                    resolved_idx: Some(1),
4957                }],
4958            },
4959            FileNode {
4960                path: "src/cli.rs".to_string(),
4961                defs: vec![Definition {
4962                    name: "main".to_string(),
4963                    kind: "function_item".to_string(),
4964                    start_line: 1,
4965                    end_line: 20,
4966                    scope: String::new(),
4967                    signature: Some("fn main()".to_string()),
4968                    start_byte: 0,
4969                    end_byte: 400,
4970                    calls: vec![CallRef {
4971                        name: "search".to_string(),
4972                        qualified_path: None,
4973                        receiver_type: None,
4974                        byte_offset: 50,
4975                        resolved: None,
4976                    }],
4977                }],
4978                imports: vec![ImportRef {
4979                    raw_path: "use crate::search;".to_string(),
4980                    resolved_idx: Some(2),
4981                }],
4982            },
4983        ];
4984
4985        let def_index = build_def_index(&files);
4986        resolve_calls(&mut files, &def_index, &HashMap::new());
4987        let graph = build_graph_from_files_pub(files);
4988
4989        assert!(
4990            !graph.edges.is_empty(),
4991            "Rust-shaped synthetic graph must produce edges"
4992        );
4993
4994        let focus_idx = match graph.resolve_focus_file("./src/encoder/mod.rs") {
4995            FocusResolution::Found(i) => i,
4996            other => panic!("resolver must find encoder/mod.rs via LSP path, got {other:?}"),
4997        };
4998
4999        let budget = 4000;
5000        let unfocused = render_json_budgeted(&graph, budget, None, false);
5001        let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
5002
5003        let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
5004            .files
5005            .iter()
5006            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5007            .collect();
5008        let focused_ranks: std::collections::HashMap<String, f32> = focused
5009            .files
5010            .iter()
5011            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5012            .collect();
5013
5014        eprintln!("T1 Rust — unfocused: {unfocused_ranks:#?}");
5015        eprintln!("T1 Rust — focused:   {focused_ranks:#?}");
5016
5017        let focus_path = "./src/encoder/mod.rs";
5018        let mut max_delta_ratio = 0.0_f32;
5019        for (path, &u_rank) in &unfocused_ranks {
5020            if path == focus_path {
5021                continue;
5022            }
5023            if let Some(&f_rank) = focused_ranks.get(path)
5024                && u_rank > 0.0
5025            {
5026                let ratio = (f_rank - u_rank).abs() / u_rank;
5027                if ratio > max_delta_ratio {
5028                    max_delta_ratio = ratio;
5029                }
5030            }
5031        }
5032        assert!(
5033            max_delta_ratio >= 0.05,
5034            "focus_file must rebias non-focus file ranks by ≥ 5% on Rust shapes; \
5035             max observed delta = {max_delta_ratio:.3}"
5036        );
5037    }
5038
5039    #[test]
5040    fn test_pagerank_empty() {
5041        let ranks = pagerank(0, &[], None);
5042        assert!(ranks.is_empty());
5043    }
5044
5045    #[test]
5046    fn test_render_tiers() {
5047        // Build a small graph with 10 files to exercise all tiers
5048        let files: Vec<FileNode> = (0..10)
5049            .map(|i| FileNode {
5050                path: format!("src/file_{i}.rs"),
5051                defs: vec![Definition {
5052                    name: format!("func_{i}"),
5053                    kind: "function_item".to_string(),
5054                    start_line: 1,
5055                    end_line: 5,
5056                    scope: String::new(),
5057                    signature: Some(format!("func_{i}(x: i32) -> i32")),
5058                    start_byte: 0,
5059                    end_byte: 0,
5060                    calls: vec![],
5061                }],
5062                imports: vec![],
5063            })
5064            .collect();
5065
5066        // Create a star graph: files 1-9 all import from file 0
5067        let edges: Vec<(u32, u32, u32)> = (1..10).map(|i| (i, 0, 1)).collect();
5068        let base_ranks = pagerank(10, &edges, None);
5069        let (top_callers, top_callees) = build_neighbor_lists(10, &edges);
5070
5071        let graph = RepoGraph {
5072            files,
5073            edges,
5074            base_ranks,
5075            callers: top_callers,
5076            callees: top_callees,
5077            def_edges: vec![],
5078            def_ranks: vec![],
5079            def_callers: vec![],
5080            def_callees: vec![],
5081            def_offsets: vec![0],
5082            alpha: 0.5,
5083        };
5084
5085        // Large budget: should include all files
5086        let full = render(&graph, 10_000, None);
5087        assert!(
5088            full.contains("file_0"),
5089            "output should contain the top-ranked file"
5090        );
5091        // file_0 should appear as tier 0 (highest rank)
5092        assert!(
5093            full.contains("## src/file_0.rs"),
5094            "top file should have tier 0 heading"
5095        );
5096
5097        // Tiny budget: should only fit a few files
5098        let small = render(&graph, 10, None);
5099        assert!(
5100            !small.is_empty(),
5101            "even tiny budget should produce some output"
5102        );
5103        // Should have fewer entries than full render
5104        let full_lines = full.lines().count();
5105        let small_lines = small.lines().count();
5106        assert!(
5107            small_lines < full_lines,
5108            "small budget ({small_lines} lines) should have fewer lines than full ({full_lines})"
5109        );
5110    }
5111
5112    #[test]
5113    fn test_render_empty_graph() {
5114        let graph = RepoGraph {
5115            files: vec![],
5116            edges: vec![],
5117            base_ranks: vec![],
5118            callers: vec![],
5119            callees: vec![],
5120            def_edges: vec![],
5121            def_ranks: vec![],
5122            def_callers: vec![],
5123            def_callees: vec![],
5124            def_offsets: vec![0],
5125            alpha: 0.5,
5126        };
5127        let output = render(&graph, 1000, None);
5128        assert!(output.is_empty(), "empty graph should render empty string");
5129    }
5130
5131    #[test]
5132    fn test_build_graph_on_fixtures() {
5133        let fixtures = Path::new(env!("CARGO_MANIFEST_DIR"))
5134            .parent()
5135            .unwrap()
5136            .parent()
5137            .unwrap()
5138            .join("tests")
5139            .join("fixtures");
5140
5141        let graph = build_graph(&fixtures).expect("build_graph should succeed on fixtures");
5142
5143        // Should find at least the 3 fixture files
5144        assert!(
5145            !graph.files.is_empty(),
5146            "graph should contain files from fixtures"
5147        );
5148
5149        // Should find definitions in the Rust fixture
5150        let rs_file = graph.files.iter().find(|f| f.path.ends_with("sample.rs"));
5151        assert!(rs_file.is_some(), "should find sample.rs");
5152        let rs_file = rs_file.unwrap();
5153        assert!(
5154            !rs_file.defs.is_empty(),
5155            "sample.rs should have definitions"
5156        );
5157        assert!(
5158            rs_file.defs.iter().any(|d| d.name == "hello"),
5159            "should find 'hello' function in sample.rs"
5160        );
5161
5162        // Should find definitions in the Python fixture
5163        let py_file = graph.files.iter().find(|f| f.path.ends_with("sample.py"));
5164        assert!(py_file.is_some(), "should find sample.py");
5165        let py_file = py_file.unwrap();
5166        assert!(
5167            !py_file.defs.is_empty(),
5168            "sample.py should have definitions"
5169        );
5170        assert!(
5171            py_file.defs.iter().any(|d| d.name == "greet"),
5172            "should find 'greet' function in sample.py"
5173        );
5174
5175        // PageRank scores should be computed
5176        assert_eq!(graph.base_ranks.len(), graph.files.len());
5177        let sum: f32 = graph.base_ranks.iter().sum();
5178        assert!(
5179            (sum - 1.0).abs() < 0.01,
5180            "PageRank scores should sum to ~1.0, got {sum}"
5181        );
5182    }
5183
5184    #[test]
5185    fn test_extract_imports_rust() {
5186        let source = "use crate::foo::bar;\nuse std::collections::HashMap;\n";
5187        let (lang, query) = import_query_for_extension("rs").unwrap();
5188        let imports = extract_imports(source, &lang, &query);
5189        assert_eq!(imports.len(), 2);
5190        assert!(imports[0].contains("crate::foo::bar"));
5191    }
5192
5193    #[test]
5194    fn test_extract_imports_python_stub() {
5195        let source = "from typing import Protocol\nimport pkg.types\n";
5196        let (lang, query) = import_query_for_extension("pyi").unwrap();
5197        let imports = extract_imports(source, &lang, &query);
5198        assert_eq!(imports.len(), 2);
5199        assert!(imports[0].contains("from typing import Protocol"));
5200        assert!(imports[1].contains("import pkg.types"));
5201    }
5202
5203    #[test]
5204    fn test_resolve_python_import_to_stub_file() {
5205        let root = PathBuf::from("/project");
5206        let mut file_index = HashMap::new();
5207        file_index.insert(PathBuf::from("/project/pkg/types.pyi"), 1);
5208
5209        let result = resolve_python_import("import pkg.types", &root, &file_index);
5210        assert_eq!(result, Some(1));
5211    }
5212
5213    #[test]
5214    fn test_resolve_rust_crate_import() {
5215        let root = PathBuf::from("/project");
5216        let file_path = PathBuf::from("/project/src/main.rs");
5217        let mut file_index = HashMap::new();
5218        file_index.insert(PathBuf::from("/project/src/foo/bar.rs"), 1);
5219        file_index.insert(PathBuf::from("/project/src/main.rs"), 0);
5220
5221        let result = resolve_rust_import("use crate::foo::bar;", &file_path, &root, &file_index);
5222        assert_eq!(result, Some(1));
5223    }
5224
5225    #[test]
5226    fn test_resolve_rust_external_crate_dropped() {
5227        let root = PathBuf::from("/project");
5228        let file_path = PathBuf::from("/project/src/main.rs");
5229        let file_index = HashMap::new();
5230
5231        let result = resolve_rust_import(
5232            "use std::collections::HashMap;",
5233            &file_path,
5234            &root,
5235            &file_index,
5236        );
5237        assert_eq!(result, None, "external crate imports should be dropped");
5238    }
5239
5240    #[test]
5241    fn test_neighbor_lists() {
5242        // 0 -> 1, 0 -> 2, 1 -> 2
5243        let edges = vec![(0, 1, 1), (0, 2, 1), (1, 2, 1)];
5244        let (incoming, outgoing) = build_neighbor_lists(3, &edges);
5245
5246        // Node 2 should be called by 0 and 1
5247        assert!(incoming[2].contains(&0));
5248        assert!(incoming[2].contains(&1));
5249
5250        // Node 0 should call 1 and 2
5251        assert!(outgoing[0].contains(&1));
5252        assert!(outgoing[0].contains(&2));
5253    }
5254
5255    /// G1 (R2.3 issue a): A scoped call `mod_a::foo()` must store:
5256    /// - `name = "foo"` (bare identifier, for def-index lookup)
5257    /// - `qualified_path = Some("mod_a::foo")` (full path, for disambiguation)
5258    ///
5259    /// Before G1, `name` stored the full `"mod_a::foo"` path. After G1, `name`
5260    /// is always the bare trailing identifier and `qualified_path` carries the
5261    /// full path when the call is scoped.
5262    #[test]
5263    fn test_scoped_identifier_calls_preserve_path() {
5264        use crate::languages;
5265        use streaming_iterator::StreamingIterator as _;
5266
5267        let source = "
5268mod mod_a {
5269    pub fn foo() {}
5270}
5271mod mod_b {
5272    pub fn foo() {}
5273}
5274fn caller() {
5275    mod_a::foo();
5276    mod_b::foo();
5277}
5278";
5279        let call_config =
5280            languages::call_query_for_extension("rs").expect("Rust call config must exist");
5281        let lang_config =
5282            languages::config_for_extension("rs").expect("Rust lang config must exist");
5283
5284        let mut defs = {
5285            let mut parser = tree_sitter::Parser::new();
5286            parser.set_language(&lang_config.language).unwrap();
5287            let tree = parser.parse(source, None).unwrap();
5288            let mut cursor = tree_sitter::QueryCursor::new();
5289            let mut out = Vec::new();
5290            let mut matches =
5291                cursor.matches(&lang_config.query, tree.root_node(), source.as_bytes());
5292            while let Some(m) = matches.next() {
5293                let mut name = String::new();
5294                let mut def_node = None;
5295                for cap in m.captures {
5296                    let cname = &lang_config.query.capture_names()[cap.index as usize];
5297                    if *cname == "name" {
5298                        name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
5299                    } else if *cname == "def" {
5300                        def_node = Some(cap.node);
5301                    }
5302                }
5303                if let Some(node) = def_node {
5304                    #[expect(clippy::cast_possible_truncation)]
5305                    out.push(Definition {
5306                        name,
5307                        kind: node.kind().to_string(),
5308                        start_line: node.start_position().row as u32 + 1,
5309                        end_line: node.end_position().row as u32 + 1,
5310                        scope: String::new(),
5311                        signature: None,
5312                        start_byte: node.start_byte() as u32,
5313                        end_byte: node.end_byte() as u32,
5314                        calls: vec![],
5315                    });
5316                }
5317            }
5318            out
5319        };
5320
5321        extract_calls(source, &call_config, &mut defs);
5322
5323        // Find the `caller` function definition
5324        let caller_def = defs
5325            .iter()
5326            .find(|d| d.name == "caller")
5327            .expect("caller def");
5328
5329        // G1: bare name is "foo", qualified_path carries the module path.
5330        let call_names: Vec<&str> = caller_def.calls.iter().map(|c| c.name.as_str()).collect();
5331        let qualified_paths: Vec<Option<&str>> = caller_def
5332            .calls
5333            .iter()
5334            .map(|c| c.qualified_path.as_deref())
5335            .collect();
5336
5337        // Bare names must be the trailing identifier only.
5338        assert!(
5339            call_names.contains(&"foo"),
5340            "bare name 'foo' must appear for scoped calls; got: {call_names:?}"
5341        );
5342        // Qualified paths must carry the full scope.
5343        assert!(
5344            qualified_paths.contains(&Some("mod_a::foo")),
5345            "qualified_path 'mod_a::foo' must appear; got: {qualified_paths:?}"
5346        );
5347        assert!(
5348            qualified_paths.contains(&Some("mod_b::foo")),
5349            "qualified_path 'mod_b::foo' must appear; got: {qualified_paths:?}"
5350        );
5351        // Full paths must NOT appear in bare names.
5352        assert!(
5353            !call_names.contains(&"mod_a::foo"),
5354            "full path 'mod_a::foo' must not appear in bare name; got: {call_names:?}"
5355        );
5356    }
5357
5358    /// RED test (R2.3 issue b+c): Two defs named `Read` in different modules,
5359    /// an unqualified call to `Read`. Resolution must NOT silently pick the first.
5360    /// Either both are returned (ambiguous) or none.
5361    #[test]
5362    fn test_ambiguous_name_resolution_returns_all_or_none() {
5363        // Build two FileNodes each with a def named "Read", then a third with an
5364        // unqualified call to "Read".
5365        let file_a = FileNode {
5366            path: "mod_a.rs".to_string(),
5367            defs: vec![Definition {
5368                name: "Read".to_string(),
5369                kind: "trait_item".to_string(),
5370                start_line: 1,
5371                end_line: 3,
5372                scope: String::new(),
5373                signature: None,
5374                start_byte: 0,
5375                end_byte: 50,
5376                calls: vec![],
5377            }],
5378            imports: vec![],
5379        };
5380        let file_b = FileNode {
5381            path: "mod_b.rs".to_string(),
5382            defs: vec![Definition {
5383                name: "Read".to_string(),
5384                kind: "trait_item".to_string(),
5385                start_line: 1,
5386                end_line: 3,
5387                scope: String::new(),
5388                signature: None,
5389                start_byte: 0,
5390                end_byte: 50,
5391                calls: vec![],
5392            }],
5393            imports: vec![],
5394        };
5395        let file_c = FileNode {
5396            path: "caller.rs".to_string(),
5397            defs: vec![Definition {
5398                name: "do_thing".to_string(),
5399                kind: "function_item".to_string(),
5400                start_line: 1,
5401                end_line: 5,
5402                scope: String::new(),
5403                signature: None,
5404                start_byte: 0,
5405                end_byte: 100,
5406                calls: vec![CallRef {
5407                    name: "Read".to_string(),
5408                    qualified_path: None,
5409                    receiver_type: None,
5410                    byte_offset: 10,
5411                    resolved: None,
5412                }],
5413            }],
5414            imports: vec![],
5415        };
5416
5417        let mut files = vec![file_a, file_b, file_c];
5418        let def_index = build_def_index(&files);
5419        resolve_calls(&mut files, &def_index, &HashMap::new());
5420
5421        // The unqualified call to "Read" is ambiguous (two candidates, neither in same
5422        // file nor imported). Resolution must leave it as None — silent first-wins is wrong.
5423        let resolved = files[2].defs[0].calls[0].resolved;
5424        assert_eq!(
5425            resolved, None,
5426            "ambiguous unqualified call with no import context must resolve to None, not silently pick first"
5427        );
5428    }
5429
5430    // ── D1 / D2 tests ────────────────────────────────────────────────
5431
5432    /// Build a small test graph with N files and an optional JSON-extension file.
5433    fn build_test_graph(n_code: usize, include_json: bool) -> (RepoGraph, Vec<usize>) {
5434        let mut file_nodes: Vec<FileNode> = (0..n_code)
5435            .map(|i| FileNode {
5436                path: format!("src/file_{i}.rs"),
5437                defs: vec![
5438                    Definition {
5439                        name: format!("func_{i}"),
5440                        kind: "function_item".to_string(),
5441                        start_line: 1,
5442                        end_line: 5,
5443                        scope: String::new(),
5444                        signature: Some(format!("fn func_{i}() -> i32")),
5445                        start_byte: 0,
5446                        end_byte: 100,
5447                        calls: vec![],
5448                    },
5449                    Definition {
5450                        name: format!("MyStruct{i}"),
5451                        kind: "struct_item".to_string(),
5452                        start_line: 7,
5453                        end_line: 10,
5454                        scope: String::new(),
5455                        signature: None,
5456                        start_byte: 110,
5457                        end_byte: 200,
5458                        calls: vec![],
5459                    },
5460                ],
5461                imports: vec![],
5462            })
5463            .collect();
5464
5465        let json_idx = if include_json {
5466            let idx = file_nodes.len();
5467            file_nodes.push(FileNode {
5468                path: "data/config.json".to_string(),
5469                defs: vec![],
5470                imports: vec![],
5471            });
5472            vec![idx]
5473        } else {
5474            vec![]
5475        };
5476
5477        // Build a star graph: all code files point to file_0.
5478        let n = file_nodes.len();
5479        #[expect(clippy::cast_possible_truncation, reason = "test: n_code << u32::MAX")]
5480        let edges: Vec<(u32, u32, u32)> = (1..n_code).map(|i| (i as u32, 0, 1)).collect();
5481
5482        let base_ranks = pagerank(n, &edges, None);
5483        let (callers, callees) = build_neighbor_lists(n, &edges);
5484
5485        let graph = RepoGraph {
5486            files: file_nodes,
5487            edges,
5488            base_ranks,
5489            callers,
5490            callees,
5491            def_edges: vec![],
5492            def_ranks: vec![],
5493            def_callers: vec![],
5494            def_callees: vec![],
5495            def_offsets: vec![0],
5496            alpha: 0.5,
5497        };
5498
5499        (graph, json_idx)
5500    }
5501
5502    /// D1: `render_json` returns a `GetRepoMapResponse` with a `files` array.
5503    ///
5504    /// On the baseline (before D1) `get_repo_map_ripvec` returned markdown prose via
5505    /// `repo_map::render`; no `files` key existed in the output.
5506    #[test]
5507    fn get_repo_map_returns_json_with_files_array() {
5508        let (graph, _) = build_test_graph(5, false);
5509        let response = render_json(&graph, 50, None, false);
5510        assert!(
5511            !response.files.is_empty(),
5512            "files array should be non-empty for a non-empty graph"
5513        );
5514        // Serialize and verify the JSON shape has a `files` key.
5515        let json = serde_json::to_string(&response).expect("serialize");
5516        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5517        assert!(
5518            parsed["files"].is_array(),
5519            "serialized response must have a `files` JSON array; got: {parsed}"
5520        );
5521    }
5522
5523    /// D1: every file entry has an `lsp_location` field.
5524    ///
5525    /// Before D1, output was prose text; no `lsp_location` existed anywhere in the response.
5526    #[test]
5527    fn get_repo_map_each_file_has_lsp_location() {
5528        let (graph, _) = build_test_graph(5, false);
5529        let response = render_json(&graph, 50, None, false);
5530        for file in &response.files {
5531            assert!(
5532                !file.lsp_location.file_path.is_empty(),
5533                "each file must have a non-empty lsp_location.file_path"
5534            );
5535        }
5536        // Also verify through JSON.
5537        let json = serde_json::to_string(&response).expect("serialize");
5538        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5539        for entry in parsed["files"].as_array().expect("files array") {
5540            assert!(
5541                entry["lsp_location"]["file_path"].is_string(),
5542                "each file entry must have lsp_location.file_path string; entry: {entry}"
5543            );
5544        }
5545    }
5546
5547    /// D1: every symbol has a `kind` (u32) and an `lsp_location`.
5548    ///
5549    /// Before D1 symbols were rendered as prose strings like `"function_item func_0"`.
5550    #[test]
5551    fn get_repo_map_each_symbol_has_kind_and_lsp_location() {
5552        let (graph, _) = build_test_graph(3, false);
5553        let response = render_json(&graph, 50, None, false);
5554        for file in &response.files {
5555            for sym in &file.symbols {
5556                assert!(
5557                    sym.kind > 0,
5558                    "symbol kind must be a positive LSP SymbolKind; got 0 for '{}'",
5559                    sym.name
5560                );
5561                assert!(
5562                    !sym.lsp_location.file_path.is_empty(),
5563                    "symbol must have lsp_location.file_path"
5564                );
5565            }
5566        }
5567        // Verify through JSON: kind should be a number.
5568        let json = serde_json::to_string(&response).expect("serialize");
5569        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5570        for file_entry in parsed["files"].as_array().expect("files") {
5571            for sym_entry in file_entry["symbols"].as_array().expect("symbols") {
5572                assert!(
5573                    sym_entry["kind"].is_number(),
5574                    "symbol `kind` must be a JSON number; sym: {sym_entry}"
5575                );
5576                assert!(
5577                    sym_entry["lsp_location"]["file_path"].is_string(),
5578                    "symbol must have lsp_location.file_path; sym: {sym_entry}"
5579                );
5580            }
5581        }
5582    }
5583
5584    /// D1: `calls` field is an array of `RepoMapCall`-shaped objects (each has
5585    /// `lsp_location` and `rank`).
5586    ///
5587    /// In 4.0.1 calls moved from bare `lsp_location` objects to `RepoMapCall`
5588    /// objects that carry both the target `lsp_location` and the target file's
5589    /// `base_rank`.
5590    #[test]
5591    fn get_repo_map_calls_field_is_array_of_lsp_locations() {
5592        // Build a 5-file star graph so file_0 has non-empty callees.
5593        let (graph, _) = build_test_graph(5, false);
5594        let response = render_json(&graph, 50, None, false);
5595        let json = serde_json::to_string(&response).expect("serialize");
5596        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
5597        for file_entry in parsed["files"].as_array().expect("files") {
5598            let calls = file_entry["calls"]
5599                .as_array()
5600                .expect("calls must be an array");
5601            for call in calls {
5602                // In 4.0.1 each call entry is a RepoMapCall with lsp_location + rank.
5603                assert!(
5604                    call["lsp_location"]["file_path"].is_string(),
5605                    "each call entry must have lsp_location.file_path string; call: {call}"
5606                );
5607                assert!(
5608                    call["rank"].is_number(),
5609                    "each call entry must have a numeric rank; call: {call}"
5610                );
5611            }
5612        }
5613    }
5614
5615    /// D2 / G3: `render_json_budgeted` with a very tight budget returns fewer files.
5616    ///
5617    /// Before the budget allocator, `max_files=3` controlled file count but not
5618    /// per-file expansion. In 4.0.1 the token_budget controls total bytes; with
5619    /// a budget of 1 token (= 4 bytes) only the envelope minimum allows any file
5620    /// at all, and the test verifies that the total_files counter still reflects
5621    /// the full eligible count. `render_json` (compat shim) passes a generous
5622    /// budget; use `render_json_budgeted` with a tight budget to verify the cap.
5623    #[test]
5624    fn get_repo_map_returns_at_most_max_files_files() {
5625        let (graph, _) = build_test_graph(10, false);
5626        // Use render_json_budgeted directly with a tight budget (600 bytes = 3 files
5627        // × 200-byte floor). Each file's envelope minimum is 200 bytes so a 600-byte
5628        // budget should admit at most 3 files.
5629        let response = render_json_budgeted(&graph, 150, None, false);
5630        assert!(
5631            response.files.len() <= 3,
5632            "files.len() = {} must be <= 3 for a 600-byte budget",
5633            response.files.len()
5634        );
5635        assert_eq!(
5636            response.total_files, 10,
5637            "total_files must reflect the full eligible count before budget cap"
5638        );
5639        assert!(
5640            response.capped,
5641            "capped must be true when total_files > files.len()"
5642        );
5643    }
5644
5645    /// D2: `include_metadata=false` (default) excludes JSON/TOML/etc. files.
5646    ///
5647    /// Before D2, JSON files with thousands of repeated keys dominated the
5648    /// output (Issue #5 — JSON-key flooding).
5649    #[test]
5650    fn get_repo_map_excludes_meta_by_default() {
5651        let (graph, _) = build_test_graph(3, /*include_json=*/ true);
5652        // Default: include_metadata = false
5653        let response = render_json(&graph, 50, None, false);
5654        for file in &response.files {
5655            assert!(
5656                !std::path::Path::new(&file.lsp_location.file_path)
5657                    .extension()
5658                    .is_some_and(|e| e.eq_ignore_ascii_case("json")),
5659                "JSON (Meta) files must be excluded when include_metadata=false; found: {}",
5660                file.lsp_location.file_path
5661            );
5662        }
5663    }
5664
5665    /// D2: `include_metadata=true` includes JSON files.
5666    ///
5667    /// Callers who opt-in to metadata should see all content kinds.
5668    #[test]
5669    fn get_repo_map_include_metadata_true_includes_json() {
5670        let (graph, _) = build_test_graph(3, /*include_json=*/ true);
5671        let response = render_json(&graph, 50, None, true);
5672        let has_json = response.files.iter().any(|f| {
5673            std::path::Path::new(&f.lsp_location.file_path)
5674                .extension()
5675                .is_some_and(|e| e.eq_ignore_ascii_case("json"))
5676        });
5677        assert!(
5678            has_json,
5679            "JSON file must be present when include_metadata=true"
5680        );
5681    }
5682
5683    /// J1/J2 MEASUREMENT: flask corpus focus_file=blueprints.py rank dispersion.
5684    ///
5685    /// Mandatory measurement from the 4.0.5 Wave-2 Front-C briefing:
5686    /// - `len(files) >= 8` (not collapsed to just the focus)
5687    /// - focus file rank is the highest in the response
5688    /// - next 5 files all have rank >= 10% of focus rank
5689    /// - neighborhood contains semantically related files (app.py, scaffold.py)
5690    #[test]
5691    #[ignore = "runs on flask corpus at tests/corpus/code/flask; use --ignored --nocapture"]
5692    #[expect(
5693        clippy::too_many_lines,
5694        reason = "end-to-end corpus measurement test; assertion sequence is sequential and cannot be meaningfully split"
5695    )]
5696    fn test_flask_focus_blueprints_rank_dispersion() {
5697        let corpus_root = Path::new(env!("CARGO_MANIFEST_DIR"))
5698            .parent()
5699            .unwrap()
5700            .parent()
5701            .unwrap()
5702            .join("tests/corpus/code/flask");
5703
5704        assert!(
5705            corpus_root.exists(),
5706            "flask corpus not found at {}",
5707            corpus_root.display()
5708        );
5709
5710        let graph = build_graph(&corpus_root).expect("build_graph on flask corpus");
5711        eprintln!("Flask corpus: {} files in graph", graph.files.len());
5712
5713        // Find focus file
5714        let focus_path = "src/flask/blueprints.py";
5715        let focus_idx = graph.files.iter().position(|f| f.path == focus_path);
5716        eprintln!("Focus file '{focus_path}' -> idx: {focus_idx:?}");
5717        assert!(
5718            focus_idx.is_some(),
5719            "blueprints.py not found in graph; available files: {:?}",
5720            graph
5721                .files
5722                .iter()
5723                .map(|f| &f.path)
5724                .take(20)
5725                .collect::<Vec<_>>()
5726        );
5727
5728        let response = render_json_budgeted(&graph, 4000, focus_idx, false);
5729
5730        // Criterion 1: at least 8 files returned.
5731        eprintln!(
5732            "Focused response: {} files (total_files={})",
5733            response.files.len(),
5734            response.total_files
5735        );
5736        assert!(
5737            response.files.len() >= 8,
5738            "expected >= 8 files in focused response; got {} — I#16 winner-take-all collapse",
5739            response.files.len()
5740        );
5741
5742        // Print top 10 for inspection.
5743        eprintln!("\nTop 10 focused files:");
5744        for (i, f) in response.files.iter().take(10).enumerate() {
5745            eprintln!("  [{i}] rank={:.6}  {}", f.rank, f.lsp_location.file_path);
5746        }
5747
5748        // Criterion 2: focus file must appear near the top (top-3) of focused
5749        // results.  With PERSONALIZATION_ALPHA=0.15 and the flask corpus,
5750        // src/flask/app.py has higher structural rank than blueprints.py and
5751        // may legitimately rank #1 — the focus boosts blueprints.py relative
5752        // to its unfocused position, but doesn't guarantee it beats every
5753        // structurally central neighbor.  Being in top-3 confirms the bias
5754        // is working (pre-fix blueprints.py was #1 at 0.703 but that was a
5755        // degenerate collapse; now #1 or #2 is healthy).
5756        let focus_file_rank = response
5757            .files
5758            .iter()
5759            .find(|f| {
5760                f.lsp_location.file_path.contains("blueprints.py")
5761                    && !f.lsp_location.file_path.contains("test_")
5762                    && !f.lsp_location.file_path.contains("sansio")
5763            })
5764            .map(|f| f.rank)
5765            .unwrap_or(0.0);
5766        let focus_position = response
5767            .files
5768            .iter()
5769            .position(|f| {
5770                f.lsp_location.file_path.contains("blueprints.py")
5771                    && !f.lsp_location.file_path.contains("test_")
5772                    && !f.lsp_location.file_path.contains("sansio")
5773            })
5774            .unwrap_or(usize::MAX);
5775        eprintln!(
5776            "\nblueprinets.py position: #{} rank={:.6}",
5777            focus_position + 1,
5778            focus_file_rank
5779        );
5780        assert!(
5781            focus_position < 3,
5782            "blueprints.py must be in top-3 focused results (got position {}); \
5783             soft personalization must rebias toward focus neighborhood — I#16",
5784            focus_position + 1
5785        );
5786
5787        // Criterion 3: next 5 non-focus files have rank >= 10% of the top
5788        // file's rank.  This is the core dispersion check: no more Dirac-delta
5789        // collapse where one file is 0.703 and all others are 0.003.
5790        let top_rank = response.files[0].rank;
5791        let non_focus_min_5 = response
5792            .files
5793            .iter()
5794            .filter(|f| {
5795                !(f.lsp_location.file_path.contains("blueprints.py")
5796                    && !f.lsp_location.file_path.contains("test_")
5797                    && !f.lsp_location.file_path.contains("sansio"))
5798            })
5799            .take(5)
5800            .map(|f| f.rank)
5801            .fold(f32::INFINITY, f32::min);
5802        let pct = non_focus_min_5 / top_rank * 100.0;
5803        eprintln!(
5804            "\nNext-5 (non-focus) min rank: {non_focus_min_5:.6} = {pct:.1}% of top rank {top_rank:.6}"
5805        );
5806        assert!(
5807            pct >= 10.0,
5808            "next-5 non-focus files min rank is {pct:.1}% of top rank (need ≥ 10%); \
5809             files are collapsing to near-zero floor — I#16"
5810        );
5811
5812        // Criterion 4: neighborhood quality — related files present.
5813        let related_names = ["app.py", "scaffold.py", "sansio"];
5814        let found_related: Vec<&str> = related_names
5815            .iter()
5816            .copied()
5817            .filter(|name| {
5818                response
5819                    .files
5820                    .iter()
5821                    .any(|f| f.lsp_location.file_path.contains(name))
5822            })
5823            .collect();
5824        eprintln!("\nNeighborhood quality: found related files: {found_related:?}");
5825        // At least one related file should appear (soft assertion — log if missing).
5826        if found_related.is_empty() {
5827            eprintln!(
5828                "WARNING: no expected related files (app.py, scaffold.py) found in neighborhood"
5829            );
5830        }
5831    }
5832
5833    #[test]
5834    #[ignore = "runs on full ripvec codebase; use --nocapture to see output"]
5835    fn test_full_repo_map() {
5836        use std::time::Instant;
5837
5838        let root = Path::new(env!("CARGO_MANIFEST_DIR"))
5839            .parent()
5840            .unwrap()
5841            .parent()
5842            .unwrap();
5843
5844        // Phase 1: build_graph (walk + parse + import resolve + PageRank)
5845        let t0 = Instant::now();
5846        let graph = build_graph(root).expect("build_graph on ripvec root");
5847        let build_ms = t0.elapsed().as_secs_f64() * 1000.0;
5848
5849        // Phase 2: render (default, no focus)
5850        let t1 = Instant::now();
5851        let rendered = render(&graph, 2000, None);
5852        let render_ms = t1.elapsed().as_secs_f64() * 1000.0;
5853
5854        // Phase 3: render (topic-sensitive, focused on highest-ranked file)
5855        let t2 = Instant::now();
5856        let focus_idx = graph
5857            .base_ranks
5858            .iter()
5859            .enumerate()
5860            .max_by(|a, b| a.1.total_cmp(b.1))
5861            .map(|(i, _)| i);
5862        let focused = render(&graph, 2000, focus_idx);
5863        let focus_ms = t2.elapsed().as_secs_f64() * 1000.0;
5864
5865        eprintln!("\n=== Repo Map Performance ===");
5866        eprintln!(
5867            "Files: {}, Edges: {}, Defs: {}",
5868            graph.files.len(),
5869            graph.edges.len(),
5870            graph.files.iter().map(|f| f.defs.len()).sum::<usize>()
5871        );
5872        eprintln!("build_graph:     {build_ms:.1}ms (walk + parse + resolve + PageRank)");
5873        eprintln!(
5874            "render(default): {render_ms:.3}ms ({} chars, ~{} tokens)",
5875            rendered.len(),
5876            rendered.len() / 4
5877        );
5878        eprintln!(
5879            "render(focused): {focus_ms:.3}ms ({} chars, ~{} tokens)",
5880            focused.len(),
5881            focused.len() / 4
5882        );
5883
5884        eprintln!("\nTop 5 by PageRank:");
5885        let mut ranked: Vec<(usize, f32)> = graph.base_ranks.iter().copied().enumerate().collect();
5886        ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
5887        for (i, rank) in ranked.iter().take(5) {
5888            eprintln!("  {:.4} {}", rank, graph.files[*i].path);
5889        }
5890
5891        eprintln!("\n=== Default Render ===\n{rendered}");
5892        eprintln!(
5893            "\n=== Focused Render (on {}) ===\n{focused}",
5894            focus_idx
5895                .map(|i| graph.files[i].path.as_str())
5896                .unwrap_or("none")
5897        );
5898    }
5899}