Skip to main content

ripvec_core/
repo_map.rs

1//! `PageRank`-weighted structural overview of a codebase.
2//!
3//! Builds a dependency graph from tree-sitter definition and import extraction,
4//! ranks files by importance using `PageRank` (standard or topic-sensitive), and
5//! renders a budget-constrained overview with tiered detail levels.
6
7use std::collections::{HashMap, HashSet};
8use std::fmt::Write as _;
9use std::path::{Path, PathBuf};
10
11use rayon::prelude::*;
12use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
13use streaming_iterator::StreamingIterator;
14use tree_sitter::{Parser, Query, QueryCursor};
15
16use serde::{Deserialize, Serialize};
17
18use crate::chunk::ContentKind;
19use crate::languages;
20use crate::walk;
21
22/// Serialize a `ContentKind` to a lowercase string tag for JSON output.
23fn content_kind_tag(ck: ContentKind) -> &'static str {
24    match ck {
25        ContentKind::Code => "code",
26        ContentKind::Docs => "docs",
27        ContentKind::Meta => "meta",
28    }
29}
30
31// ── Data Structures ──────────────────────────────────────────────────
32
33/// Persisted dependency graph with `PageRank` scores.
34#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
35pub struct RepoGraph {
36    /// Files in the repository with definitions, imports, and calls.
37    pub files: Vec<FileNode>,
38    /// File-level edges (derived from def-level call edges).
39    pub edges: Vec<(u32, u32, u32)>,
40    /// File-level `PageRank` scores (aggregated from def-level).
41    pub base_ranks: Vec<f32>,
42    /// File-level callers (indices into `files`).
43    pub callers: Vec<Vec<u32>>,
44    /// File-level callees (indices into `files`).
45    pub callees: Vec<Vec<u32>>,
46    /// Definition-level call edges: `(caller_def, callee_def, weight)`.
47    pub def_edges: Vec<(DefId, DefId, u32)>,
48    /// Definition-level `PageRank` scores (flattened: `offsets[file_idx] + def_idx`).
49    pub def_ranks: Vec<f32>,
50    /// Definition-level callers (flattened, parallel to `def_ranks`).
51    pub def_callers: Vec<Vec<DefId>>,
52    /// Definition-level callees (flattened, parallel to `def_ranks`).
53    pub def_callees: Vec<Vec<DefId>>,
54    /// Prefix-sum offsets for flattening `DefId` to linear index.
55    pub def_offsets: Vec<usize>,
56    /// Auto-tuned alpha for search boost.
57    pub alpha: f32,
58}
59
60/// A file in the repository with its definitions and imports.
61#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
62pub struct FileNode {
63    /// Relative path from the repository root.
64    pub path: String,
65    /// Definitions (functions, structs, classes, etc.) extracted from this file.
66    pub defs: Vec<Definition>,
67    /// Import references extracted from this file.
68    pub imports: Vec<ImportRef>,
69}
70
71/// A definition extracted from a source file.
72#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
73pub struct Definition {
74    /// Name of the definition (e.g., function name, class name).
75    pub name: String,
76    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
77    pub kind: String,
78    /// 1-based start line number.
79    pub start_line: u32,
80    /// 1-based end line number.
81    pub end_line: u32,
82    /// Scope chain (e.g., `"impl_item Foo > fn bar"`).
83    pub scope: String,
84    /// Function/method signature, if available.
85    pub signature: Option<String>,
86    /// Byte offset of this definition's start in the source file.
87    pub start_byte: u32,
88    /// Byte offset of this definition's end in the source file.
89    pub end_byte: u32,
90    /// Call sites within this definition's body.
91    pub calls: Vec<CallRef>,
92    /// The first decorator name for Python `decorated_definition` nodes (e.g.,
93    /// `"property"`, `"classmethod"`, `"staticmethod"`, `"cached_property"`).
94    ///
95    /// `None` for all non-Python definitions and for bare (undecorated) Python
96    /// functions and classes. Populated by `extract_definitions` at AST-parse
97    /// time with full tree-sitter access (C1, 4.1.1).
98    pub decorator: Option<String>,
99    /// Decorator-aware LSP SymbolKind integer (e.g., 7=Property, 12=Function).
100    ///
101    /// Computed at parse time when the AST is available so projection sites
102    /// (`render_json_budgeted`) do not need to re-parse. For Python `@property`
103    /// or `@cached_property` → 7. For `@classmethod`, `@staticmethod`, or any
104    /// other decorator → 12. `None` for all non-decorated definitions; callers
105    /// fall back to `lsp_symbol_kind_for_node_kind(&self.kind)` when `None`
106    /// (C1/C2, 4.1.1).
107    pub lsp_kind_hint: Option<u32>,
108}
109
110/// An import reference extracted from a source file.
111#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
112pub struct ImportRef {
113    /// Raw import path as written in source (e.g., `crate::foo::bar`).
114    pub raw_path: String,
115    /// Resolved file index in [`RepoGraph::files`], if resolution succeeded.
116    pub resolved_idx: Option<u32>,
117}
118
119/// Unique identifier for a definition: (file index, definition index within file).
120pub type DefId = (u32, u16);
121
122/// A call site extracted from a definition body.
123#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
124pub struct CallRef {
125    /// Callee function/method name (bare, without qualifier).
126    ///
127    /// For scoped calls like `mod_a::foo()`, this is `"foo"`.
128    /// For bare calls like `foo()`, this is `"foo"`.
129    pub name: String,
130    /// Full qualified path for scoped calls, e.g. `Some("mod_a::foo")`.
131    ///
132    /// `None` for bare (unqualified) calls. When `Some`, `resolve_calls`
133    /// uses this for qualifier-based module disambiguation before falling
134    /// back to the bare `name`.
135    pub qualified_path: Option<String>,
136    /// Receiver type for method calls, inferred from local context.
137    ///
138    /// Set to `Some("Foo")` when:
139    /// - The call is `self.method()` inside `impl Foo { … }`.
140    /// - The call is `x.method()` where `x` has an explicit type annotation `x: Foo`.
141    /// - The call is `x.method()` after `let x = Foo::new()`.
142    ///
143    /// `None` for free function calls, or when the receiver type cannot be
144    /// inferred from local context alone. When `Some`, `resolve_calls` prefers
145    /// defs whose enclosing impl scope matches the receiver type.
146    pub receiver_type: Option<String>,
147    /// Byte offset of the call in the source file (for scoping to definitions).
148    pub byte_offset: u32,
149    /// Resolved target definition, if resolution succeeded.
150    pub resolved: Option<DefId>,
151}
152
153// ── JSON output types ────────────────────────────────────────────────
154
155/// LSP-shaped location pointing at a file or symbol within a file.
156///
157/// Lines and characters are 0-based, matching the Language Server Protocol
158/// convention so callers can pass this directly to LSP tools without any
159/// conversion.
160#[derive(Debug, Clone, Serialize)]
161pub struct RepoMapLspLocation {
162    /// Relative path from the repository root (prefixed with `./`).
163    pub file_path: String,
164    /// 0-based start line.
165    pub start_line: usize,
166    /// 0-based start character (0 for file-level locations).
167    pub start_character: usize,
168    /// 0-based end line (equals `start_line` for file-level locations).
169    pub end_line: usize,
170    /// 0-based end character (0 for file-level locations).
171    pub end_character: usize,
172}
173
174/// A top-level symbol extracted from a file in the repository map.
175///
176/// Analogous to an LSP `DocumentSymbol` but limited to the fields available
177/// from tree-sitter definition extraction. The `rank` field carries the
178/// definition-level `PageRank` score from [`RepoGraph::def_ranks`], enabling
179/// callers to prioritise symbols by structural importance.
180#[derive(Debug, Clone, Serialize)]
181pub struct RepoMapSymbol {
182    /// Symbol name (function name, struct name, etc.).
183    pub name: String,
184    /// LSP `SymbolKind` as a decimal — use the same values as
185    /// `lsp_workspace_symbols` and `lsp_document_symbols`.
186    pub kind: u32,
187    /// Location pointing at the symbol's definition line (0-based).
188    pub lsp_location: RepoMapLspLocation,
189    /// Definition-level `PageRank` score from [`RepoGraph::def_ranks`].
190    ///
191    /// Higher values indicate definitions that are called by many other
192    /// definitions. Used by the token-budget allocator to decide which
193    /// symbols to include when the per-file budget is constrained.
194    pub rank: f32,
195}
196
197/// An outgoing call-edge from a file to another file.
198///
199/// Carries both the target file's `lsp_location` and its `base_rank`
200/// (file-level `PageRank` score) so callers can decide how important
201/// each dependency is without a separate lookup.
202#[derive(Debug, Clone, Serialize)]
203pub struct RepoMapCall {
204    /// Location pointing at the target file (line 0, character 0).
205    pub lsp_location: RepoMapLspLocation,
206    /// File-level `PageRank` score of the target file.
207    pub rank: f32,
208}
209
210/// One file entry in the JSON repo map.
211///
212/// Carries the file's `PageRank` score, content kind, outgoing call-edges to
213/// other files, and the file's top-level symbol definitions — all with
214/// `lsp_location` so the caller can chain directly into LSP tools without
215/// any destructuring.
216#[derive(Debug, Clone, Serialize)]
217pub struct RepoMapFile {
218    /// Location pointing at the file itself (line 0, character 0).
219    ///
220    /// Pass `lsp_location.file_path` directly into `lsp_document_symbols` or
221    /// any other file-scoped tool.
222    pub lsp_location: RepoMapLspLocation,
223    /// `PageRank` score in [0, 1] (higher = more structurally central).
224    pub rank: f32,
225    /// Content classification: `"code"`, `"docs"`, or `"meta"`.
226    ///
227    /// Serialized as a lowercase string tag so JSON consumers can branch
228    /// without numeric magic values. Mirrors the `ContentKind` enum in
229    /// `ripvec-core::chunk`.
230    pub content_kind: &'static str,
231    /// Outgoing call-edges sorted by target file `PageRank` descending.
232    pub calls: Vec<RepoMapCall>,
233    /// Top-level definitions extracted from this file by tree-sitter,
234    /// sorted by definition-level `PageRank` descending and pruned to
235    /// the per-file token-budget allocation.
236    pub symbols: Vec<RepoMapSymbol>,
237    /// Number of symbols that were omitted due to budget exhaustion or
238    /// logarithmic attenuation cutoff. `truncated_symbols + symbols.len()`
239    /// equals the total definition count for the file.
240    pub truncated_symbols: usize,
241    /// Number of call-edges that were omitted due to the `MAX_FILE_CALLS`
242    /// render cap or byte-budget exhaustion. `truncated_calls + calls.len()`
243    /// equals the total callee count for the file (I#68, 4.1.4).
244    pub truncated_calls: usize,
245}
246
247/// JSON-mode response envelope for `get_repo_map` (4.0.1 shape).
248///
249/// Replaces the `max_files`-capped shape from 4.0.0. The caller supplies a
250/// `token_budget`; files are allocated bytes proportional to their `PageRank`
251/// (40% cap per file, 200-byte envelope floor). Symbols are filled in
252/// def-rank order with a logarithmic attenuation cutoff. Leftover bytes
253/// cascade to subsequent files.
254///
255/// The `estimated_bytes`, `budget_bytes`, and `budget_exhausted` fields give
256/// callers real-time feedback on how tightly the budget was consumed.
257#[derive(Debug, Clone, Serialize)]
258pub struct GetRepoMapResponse {
259    /// Files sorted by `PageRank` descending, pruned to the token budget.
260    pub files: Vec<RepoMapFile>,
261    /// Total number of eligible files in the graph (pre-allocation).
262    ///
263    /// If `total_files > files.len()`, the budget ran out before all files
264    /// could be included. Read `budget_exhausted` directly for the boolean.
265    pub total_files: usize,
266    /// Actual serialised-JSON byte count for all returned content.
267    pub estimated_bytes: usize,
268    /// Budget ceiling in bytes that was used for allocation
269    /// (`token_budget * 4`).
270    pub budget_bytes: usize,
271    /// `true` when `total_files > files.len()` (budget was exhausted before
272    /// all eligible files were included).
273    pub budget_exhausted: bool,
274    /// Retained for backward compatibility with 4.0.0 callers that checked
275    /// `capped`. Equivalent to `budget_exhausted`.
276    pub capped: bool,
277}
278
279// ── Constants ────────────────────────────────────────────────────────
280
281/// `PageRank` damping factor.
282const DAMPING: f32 = 0.85;
283
284/// `PageRank` convergence threshold.
285const EPSILON: f32 = 1e-6;
286
287/// Maximum `PageRank` iterations.
288const MAX_ITERATIONS: usize = 100;
289
290/// Maximum callers/callees stored per file (display-oriented neighbor lists).
291///
292/// Raised from 5 → 25 in 4.1.3 (I#60): hub functions in real Rust/Python/Go
293/// corpora commonly have 10-25 outgoing call edges; the old cap of 5 caused
294/// every large-scale corpus file to report `truncated_calls ≥ 2` and left
295/// agents reading `get_repo_map.files[i].calls[]` with a sparse skeleton.
296///
297/// The serialisation-size cost is bounded by the existing `token_budget`
298/// allocator, which gates each file's call list against its byte allocation.
299/// BFS-internal reachability (`compute_dead_code`) uses the untruncated
300/// `def_edges` CSR directly (I#57/I#61) and is NOT affected by this constant.
301const MAX_NEIGHBORS: usize = 25;
302
303/// Maximum number of outgoing call entries rendered per file in the JSON response.
304///
305/// Applied at render time in [`render_json_budgeted`] as a hard cap on
306/// `calls[]` length (I#68, 4.1.4). Symmetric with the graph-build cap
307/// [`MAX_NEIGHBORS`] so agents always see up to 25 callees — matching the
308/// def-level cap introduced in I#60.
309///
310/// Replaces the logarithmic attenuation cutoff that was previously applied to
311/// file-level callees. Attenuation is appropriate for *symbol* lists (where
312/// rank distributions are informative) but pathological for *call-edge* lists:
313/// in real corpora callee base-ranks follow a geometric distribution, causing
314/// the attenuation to fire at pos=1 and collapse `calls[]` to a single entry.
315/// The byte-budget check is retained; this constant adds a count ceiling.
316const MAX_FILE_CALLS: usize = 25;
317
318/// Approximate characters per token for budget estimation.
319const CHARS_PER_TOKEN: usize = 4;
320
321/// Concentration mass placed on the focus node in topic-sensitive `PageRank`.
322///
323/// Following Haveliwala 2002 ("Topic-Sensitive PageRank"), the personalization
324/// vector places a bias `α` on the focus node and distributes the remaining
325/// `1 - α` uniformly over all other nodes. This preserves rank dispersion
326/// across the corpus — the user sees a *neighborhood* of related files
327/// rebiased toward the focus, not a Dirac delta on the focus node with
328/// every other file collapsed to an equally negligible uniform floor.
329///
330/// Value 0.35 means:
331///   - focus node teleportation probability = 0.35
332///   - each of the (n - 1) other nodes = 0.65 / (n - 1)
333///
334/// Calibration history:
335/// - Pre-4.0.5: α = 0.70 → winner-take-all collapse (flask focus = 0.703,
336///   all others ≈ 0.003); fixed under I#16.
337/// - 4.0.5 → 4.1.11: α = 0.15 → preserved dispersion but underbiased on
338///   real corpora: flask focus blueprints.py landed at position #5-#7
339///   instead of top-3 because structural hubs (helpers.py, app.py)
340///   dominated.
341/// - 4.1.12+: α = 0.35 → focus reliably surfaces in top-3 on flask
342///   (rank ~0.10 vs hub rank ~0.10), small-graph dispersion tests still
343///   pass (n=10 star dispersion ratio remains under the 40× ceiling),
344///   J2 file-count parity relaxed from 80% to 70% as the unavoidable
345///   trade-off for stronger focus bias.
346const PERSONALIZATION_ALPHA: f32 = 0.35;
347
348// ── Import Queries ───────────────────────────────────────────────────
349
350/// Compile a tree-sitter import query for the given extension.
351///
352/// Returns `None` for unsupported extensions.
353fn import_query_for_extension(ext: &str) -> Option<(tree_sitter::Language, Query)> {
354    let (lang, query_str): (tree_sitter::Language, &str) = match ext {
355        "rs" => (
356            tree_sitter_rust::LANGUAGE.into(),
357            "(use_declaration) @import",
358        ),
359        "py" | "pyi" => (
360            tree_sitter_python::LANGUAGE.into(),
361            concat!(
362                "(import_statement) @import\n",
363                "(import_from_statement) @import",
364            ),
365        ),
366        "js" | "jsx" => (
367            tree_sitter_javascript::LANGUAGE.into(),
368            "(import_statement source: (string) @import_path) @import",
369        ),
370        "ts" => (
371            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
372            "(import_statement source: (string) @import_path) @import",
373        ),
374        "tsx" => (
375            tree_sitter_typescript::LANGUAGE_TSX.into(),
376            "(import_statement source: (string) @import_path) @import",
377        ),
378        "go" => (
379            tree_sitter_go::LANGUAGE.into(),
380            "(import_spec path: (interpreted_string_literal) @import_path) @import",
381        ),
382        // Ruby: require statements.
383        "rb" => (
384            tree_sitter_ruby::LANGUAGE.into(),
385            "(call method: (identifier) @_method arguments: (argument_list (string (string_content) @import_path)) (#eq? @_method \"require\")) @import",
386        ),
387        _ => return None,
388    };
389    let query = match Query::new(&lang, query_str) {
390        Ok(q) => q,
391        Err(e) => {
392            tracing::warn!(ext, %e, "import query compilation failed — language may be ABI-incompatible");
393            return None;
394        }
395    };
396    Some((lang, query))
397}
398
399/// Extract import paths from source using tree-sitter.
400fn extract_imports(
401    source: &str,
402    lang: &tree_sitter::Language,
403    import_query: &Query,
404) -> Vec<String> {
405    let mut parser = Parser::new();
406    if parser.set_language(lang).is_err() {
407        return vec![];
408    }
409    let Some(tree) = parser.parse(source, None) else {
410        return vec![];
411    };
412
413    let mut cursor = QueryCursor::new();
414    let mut imports = Vec::new();
415    let mut matches = cursor.matches(import_query, tree.root_node(), source.as_bytes());
416
417    while let Some(m) = matches.next() {
418        // Prefer @import_path capture (JS/TS/Go), fall back to full @import text
419        let mut import_path_text = None;
420        let mut import_text = None;
421
422        for cap in m.captures {
423            let cap_name = &import_query.capture_names()[cap.index as usize];
424            let text = &source[cap.node.start_byte()..cap.node.end_byte()];
425            if *cap_name == "import_path" {
426                import_path_text = Some(text.trim_matches(|c| c == '"' || c == '\''));
427            } else if *cap_name == "import" {
428                import_text = Some(text);
429            }
430        }
431
432        if let Some(path) = import_path_text {
433            imports.push(path.to_string());
434        } else if let Some(text) = import_text {
435            imports.push(text.to_string());
436        }
437    }
438
439    imports
440}
441
442// ── Import Resolution ────────────────────────────────────────────────
443
444/// Resolve a Rust `use` path to a file index in the file map.
445///
446/// Handles `crate::`, `self::`, and `super::` prefixes. External crate
447/// imports are dropped (returns `None`).
448fn resolve_rust_import(
449    raw: &str,
450    file_path: &Path,
451    root: &Path,
452    file_index: &HashMap<PathBuf, usize>,
453) -> Option<usize> {
454    // Extract the module path from `use crate::foo::bar;` or `use crate::foo::bar::Baz;`
455    let trimmed = raw
456        .trim()
457        .trim_start_matches("use ")
458        .trim_end_matches(';')
459        .trim();
460
461    let segments: Vec<&str> = trimmed.split("::").collect();
462    if segments.is_empty() {
463        return None;
464    }
465
466    // Determine the base directory and skip prefix segments
467    let (base, skip) = match segments[0] {
468        "crate" => {
469            // Find the nearest Cargo.toml ancestor to determine the crate root.
470            // In a workspace, `crate::foo` resolves relative to the crate's src/,
471            // not the workspace root.
472            let mut dir = file_path.parent();
473            let crate_root = loop {
474                match dir {
475                    Some(d) if d.join("Cargo.toml").exists() => break d.join("src"),
476                    Some(d) => dir = d.parent(),
477                    None => break root.join("src"), // fallback
478                }
479            };
480            (crate_root, 1)
481        }
482        "self" => {
483            let dir = file_path.parent()?;
484            (dir.to_path_buf(), 1)
485        }
486        "super" => {
487            let dir = file_path.parent()?.parent()?;
488            (dir.to_path_buf(), 1)
489        }
490        // External crate — drop
491        _ => return None,
492    };
493
494    // Build candidate paths from the remaining segments.
495    // Try progressively shorter prefixes since the last segments
496    // may be items (struct, fn) rather than modules.
497    let path_segments = &segments[skip..];
498    for end in (1..=path_segments.len()).rev() {
499        let mut candidate = base.clone();
500        for seg in &path_segments[..end] {
501            // Strip glob patterns like `{Foo, Bar}`
502            let clean = seg.split('{').next().unwrap_or(seg).trim();
503            if !clean.is_empty() {
504                candidate.push(clean);
505            }
506        }
507
508        // Try file.rs
509        let as_file = candidate.with_extension("rs");
510        if let Some(&idx) = file_index.get(&as_file) {
511            return Some(idx);
512        }
513
514        // Try dir/mod.rs
515        let as_mod = candidate.join("mod.rs");
516        if let Some(&idx) = file_index.get(&as_mod) {
517            return Some(idx);
518        }
519    }
520
521    None
522}
523
524/// Resolve an import path to a file index based on file extension.
525fn resolve_import(
526    raw: &str,
527    ext: &str,
528    file_path: &Path,
529    root: &Path,
530    file_index: &HashMap<PathBuf, usize>,
531) -> Option<usize> {
532    match ext {
533        "rs" => resolve_rust_import(raw, file_path, root, file_index),
534        "py" | "pyi" => resolve_python_import(raw, root, file_index),
535        "js" | "jsx" | "ts" | "tsx" => resolve_js_import(raw, file_path, file_index),
536        // Go imports use full package paths — skip local resolution
537        _ => None,
538    }
539}
540
541/// Resolve a Python import to a file index.
542///
543/// Handles `import foo.bar` and `from foo.bar import baz` patterns.
544fn resolve_python_import(
545    raw: &str,
546    root: &Path,
547    file_index: &HashMap<PathBuf, usize>,
548) -> Option<usize> {
549    let module_path = if let Some(rest) = raw.strip_prefix("from ") {
550        rest.split_whitespace().next()?
551    } else if let Some(rest) = raw.strip_prefix("import ") {
552        rest.split_whitespace().next()?
553    } else {
554        return None;
555    };
556
557    let rel_path: PathBuf = module_path.split('.').collect();
558    for ext in ["py", "pyi"] {
559        let as_file = root.join(&rel_path).with_extension(ext);
560        if let Some(&idx) = file_index.get(&as_file) {
561            return Some(idx);
562        }
563    }
564
565    for init_name in ["__init__.py", "__init__.pyi"] {
566        let as_init = root.join(&rel_path).join(init_name);
567        if let Some(&idx) = file_index.get(&as_init) {
568            return Some(idx);
569        }
570    }
571
572    None
573}
574
575/// Resolve a JS/TS import to a file index.
576///
577/// Handles relative paths like `./foo` or `../bar`.
578fn resolve_js_import(
579    raw: &str,
580    file_path: &Path,
581    file_index: &HashMap<PathBuf, usize>,
582) -> Option<usize> {
583    if !raw.starts_with('.') {
584        return None;
585    }
586
587    let dir = file_path.parent()?;
588    let candidate = dir.join(raw);
589
590    for ext in &["js", "jsx", "ts", "tsx"] {
591        let with_ext = candidate.with_extension(ext);
592        if let Some(&idx) = file_index.get(&with_ext) {
593            return Some(idx);
594        }
595    }
596
597    for ext in &["js", "jsx", "ts", "tsx"] {
598        let index_file = candidate.join("index").with_extension(ext);
599        if let Some(&idx) = file_index.get(&index_file) {
600            return Some(idx);
601        }
602    }
603
604    None
605}
606
607// ── Extraction ───────────────────────────────────────────────────────
608
609/// Extract the name of the first decorator from a `decorated_definition` node.
610///
611/// Mirrors the private `languages::first_decorator_ident` but inlined here
612/// because that function is private to the `languages` module.
613///
614/// For simple `@name` decorators (e.g., `@property`, `@classmethod`), returns
615/// `Some("property")` or `Some("classmethod")`.
616///
617/// For attribute-access decorators (e.g., `@functools.lru_cache`), returns
618/// `Some("functools.lru_cache")` — the full dotted-name text.
619///
620/// For call-expression decorators (e.g., `@app.route("/")`), returns `None`.
621fn extract_first_decorator_name(node: &tree_sitter::Node<'_>, source: &[u8]) -> Option<String> {
622    let mut cursor = node.walk();
623    for child in node.children(&mut cursor) {
624        if child.kind() == "decorator" {
625            let mut inner = child.walk();
626            for inner_child in child.children(&mut inner) {
627                match inner_child.kind() {
628                    // Simple name (@property) or attribute access (@functools.lru_cache) —
629                    // return the full text so callers can store it for display.
630                    "identifier" | "attribute" => {
631                        return std::str::from_utf8(
632                            &source[inner_child.start_byte()..inner_child.end_byte()],
633                        )
634                        .ok()
635                        .map(str::to_owned);
636                    }
637                    // Call expression — ambiguous; treat as None.
638                    "call" => return None,
639                    _ => {}
640                }
641            }
642            return None;
643        }
644    }
645    None
646}
647
648/// Determine whether a Python `(assignment)` or JS/TS `(variable_declarator)`
649/// def captured by the def-query is spuriously nested inside a function body.
650///
651/// Cycle 10 W1 Front A — the Python def-query at `languages.rs:637` and the
652/// JS/TS variants at `:647`, `:657`, `:669` capture every `x = foo()` or
653/// `const x = foo()` site, even those inside function bodies, as defs. The
654/// `extract_calls` smallest-enclosing rule (`repo_map.rs:827-832`) then steals
655/// every call out of the surrounding function into the nested assignment def.
656/// Result: BFS terminates early because function defs have no outgoing edges.
657///
658/// Fix: walk up the AST from the def node. If we hit a function-body context
659/// (Python `function_definition`, JS/TS `function_declaration`,
660/// `function_expression`, `arrow_function`, `method_definition`,
661/// `generator_function_declaration`, `generator_function`) before reaching the
662/// module root, the def is spurious and must be dropped.
663///
664/// Module-level constants (`MAX = 4096`) and class-attribute assignments
665/// (`class Foo: bar = make_bar()`) — both of which are legitimate defs — are
666/// preserved because their ancestor chain contains only `module` /
667/// `class_definition` / `class_body` / `block` / `program` nodes, never a
668/// function-body container.
669fn is_spurious_nested_binding_def(kind: &str, node: tree_sitter::Node<'_>) -> bool {
670    // Only the assignment / variable_declarator captures are at risk.
671    // Other def kinds (function_definition, class_definition, method_definition,
672    // function_declaration, class_declaration, type_alias_declaration, etc.)
673    // are always legitimate at any nesting depth (e.g. nested functions,
674    // methods inside classes).
675    if !matches!(kind, "assignment" | "variable_declarator") {
676        return false;
677    }
678    let mut cur = node.parent();
679    while let Some(parent) = cur {
680        match parent.kind() {
681            // Function-body containers: any def captured beneath one of these
682            // is a local-variable binding, not a module/class-level def.
683            "function_definition"           // Python
684            | "function_declaration"        // JS / TS
685            | "function_expression"         // JS / TS
686            | "arrow_function"              // JS / TS
687            | "method_definition"           // JS / TS
688            | "generator_function_declaration"
689            | "generator_function" => return true,
690            // Top-level containers — we reached the module without finding a
691            // function ancestor. The def is legitimate.
692            "module" | "program" => return false,
693            _ => {}
694        }
695        cur = parent.parent();
696    }
697    false
698}
699
700/// Extract definitions from a source file using tree-sitter.
701fn extract_definitions(source: &str, config: &languages::LangConfig) -> Vec<Definition> {
702    let mut parser = Parser::new();
703    if parser.set_language(&config.language).is_err() {
704        return vec![];
705    }
706    let Some(tree) = parser.parse(source, None) else {
707        return vec![];
708    };
709
710    let mut cursor = QueryCursor::new();
711    let mut defs = Vec::new();
712    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
713
714    while let Some(m) = matches.next() {
715        let mut name = String::new();
716        let mut def_node = None;
717
718        for cap in m.captures {
719            let cap_name = &config.query.capture_names()[cap.index as usize];
720            if *cap_name == "name" {
721                name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
722            } else if *cap_name == "def" {
723                def_node = Some(cap.node);
724            }
725        }
726
727        if let Some(node) = def_node {
728            // Cycle 10 W1 Front A: drop spurious nested assignment / variable_declarator
729            // defs that would steal calls from the enclosing function.
730            if is_spurious_nested_binding_def(node.kind(), node) {
731                continue;
732            }
733            let scope = crate::chunk::build_scope_chain(node, source);
734            let signature = crate::chunk::extract_signature(node, source);
735            #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
736            let start_line = node.start_position().row as u32 + 1;
737            #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
738            let end_line = node.end_position().row as u32 + 1;
739            #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
740            let start_byte = node.start_byte() as u32;
741            #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
742            let end_byte = node.end_byte() as u32;
743            // C1 (4.1.1): For Python `decorated_definition` nodes, extract the
744            // decorator name and compute the LSP kind at AST-parse time so the
745            // projection site in `render_json_budgeted` does not need to re-parse.
746            let (decorator, lsp_kind_hint) = if node.kind() == "decorated_definition" {
747                let dec = extract_first_decorator_name(&node, source.as_bytes());
748                let kind_hint = languages::lsp_symbol_kind_for_decorated_definition(
749                    dec.as_deref().unwrap_or(""),
750                );
751                (dec, Some(kind_hint))
752            } else {
753                (None, None)
754            };
755            defs.push(Definition {
756                name,
757                kind: node.kind().to_string(),
758                start_line,
759                end_line,
760                scope,
761                signature,
762                start_byte,
763                end_byte,
764                calls: vec![],
765                decorator,
766                lsp_kind_hint,
767            });
768        }
769    }
770
771    defs
772}
773
774// ── Call Extraction & Resolution ────────────────────────────────────
775
776/// Tiebreak priority for def attribution when two defs share the same byte span.
777///
778/// Returns `0` for function-like defs (lowest value = wins in `min_by_key`) and
779/// `1` for structural container defs (class bodies, impl blocks, etc.).
780///
781/// This resolves the Python case where the class body `block` and the first
782/// `function_definition` inside it occupy identical byte ranges; calls inside
783/// the function body should be attributed to the function, not the class block.
784fn is_callable_def_priority(kind: &str) -> u8 {
785    match kind {
786        // Function / method defs: these are the correct attribution targets.
787        "function_item"
788        | "function_definition"
789        | "function_declaration"
790        | "function_signature_item"
791        | "method_definition"
792        | "method_declaration"
793        | "method" => 0,
794        // Structural containers: class body blocks, impl items, etc.
795        // Prefer function-like defs over these when byte ranges tie.
796        _ => 1,
797    }
798}
799
800/// Extract call sites from a source file and assign them to definitions.
801///
802/// Uses the language's call query to find all call expressions, then
803/// assigns each call to the definition whose byte range contains it.
804/// Calls outside any definition body (module-level) are ignored.
805///
806/// For Rust scoped calls (`a::b::foo()`), the `@callee` capture returns the
807/// full `scoped_identifier` node. This function splits it into:
808/// - `name` = bare trailing identifier (`"foo"`)
809/// - `qualified_path` = `Some("a::b::foo")` for disambiguation in `resolve_calls`.
810///
811/// For method calls (`x.method()`), `receiver_type` is inferred from local
812/// context (parameter annotations, let-bindings, impl blocks). See
813/// [`infer_receiver_types`] for the heuristic.
814fn extract_calls(source: &str, call_config: &languages::CallConfig, defs: &mut [Definition]) {
815    let mut parser = Parser::new();
816    if parser.set_language(&call_config.language).is_err() {
817        return;
818    }
819    let Some(tree) = parser.parse(source, None) else {
820        return;
821    };
822
823    // Build receiver-type map: byte_offset_of_call → receiver_type_string.
824    // Done once per file to amortise the tree walk cost.
825    let receiver_map = infer_receiver_types(source, &tree, &call_config.language);
826
827    // HCL: run the HCL-specific call-edge extractor as a post-pass so the
828    // terraform_remote_state references and module blocks contribute edges
829    // that the generic function_call query cannot capture (R2 + R3, Wave 3).
830    if languages::is_hcl_language(&call_config.language) {
831        extract_hcl_call_edges(source, tree.root_node(), defs);
832    }
833
834    // C / C++ (I#55, 4.1.5): emit synthetic call-graph edges from
835    // struct-literal initializer fnptrs. C codebases dispatch via tables of
836    // function pointers (Linux `file_operations`, redis `redisCommandTable`,
837    // libuv handle vtables) — the generic call-expression query cannot
838    // capture these because there is no syntactic `f()` call site; the
839    // function is referenced by bare identifier inside `{ ... }`. Without
840    // this post-pass, every implementation referenced exclusively via such
841    // a table appears dead (Part XI §XI.4: kernel mega-cluster collapse on
842    // Linux, command-implementation collapse on redis).
843    if languages::is_c_language(&call_config.language)
844        || languages::is_cpp_language(&call_config.language)
845    {
846        extract_c_struct_init_edges(source, tree.root_node(), defs);
847    }
848
849    let mut cursor = QueryCursor::new();
850    let mut matches = cursor.matches(&call_config.query, tree.root_node(), source.as_bytes());
851
852    while let Some(m) = matches.next() {
853        let mut full_callee_text = None;
854        let mut call_byte = 0u32;
855
856        for cap in m.captures {
857            let cap_name = &call_config.query.capture_names()[cap.index as usize];
858            if *cap_name == "callee" {
859                full_callee_text =
860                    Some(source[cap.node.start_byte()..cap.node.end_byte()].to_string());
861                #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
862                {
863                    call_byte = cap.node.start_byte() as u32;
864                }
865            }
866        }
867
868        if let Some(full_text) = full_callee_text {
869            // Split qualified path into bare name + optional qualifier.
870            let (name, qualified_path) = if full_text.contains("::") {
871                let bare = full_text
872                    .rsplit("::")
873                    .next()
874                    .unwrap_or(&full_text)
875                    .to_string();
876                (bare, Some(full_text))
877            } else {
878                (full_text, None)
879            };
880
881            // Look up receiver type from the pre-built map.
882            let receiver_type = receiver_map.get(&call_byte).cloned();
883
884            // Assign to the most-specific (smallest byte range) enclosing definition.
885            // Using `find` (first match) was incorrect for nested defs: an `impl_item`
886            // wrapping a `function_item` both contain the call site, but the
887            // `function_item` is the correct granularity for method attribution.
888            //
889            // Tiebreak: when two defs have equal byte spans (as happens in Python where
890            // the class body `block` and its first `function_definition` share the same
891            // start/end bytes), prefer function-like defs over structural container defs.
892            // `is_callable_def` returns 0 for function-like kinds (sorts first in min_by_key).
893            let enclosing_idx = defs
894                .iter()
895                .enumerate()
896                .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
897                .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
898                .map(|(i, _)| i);
899
900            if let Some(idx) = enclosing_idx {
901                // Skip self-recursive calls (compare bare name to def name).
902                if defs[idx].name != name {
903                    defs[idx].calls.push(CallRef {
904                        name,
905                        qualified_path,
906                        receiver_type,
907                        byte_offset: call_byte,
908                        resolved: None,
909                    });
910                }
911            }
912            // Calls outside any definition are ignored (module-level init).
913        }
914    }
915
916    // JS / TS / TSX (B-0005): attribute closure-argument call edges to the
917    // nearest named enclosing function. Arrow functions and function
918    // expressions passed as arguments (e.g. `useCallback(() => fn(), [deps])`
919    // or `setTimeout(() => fn(), 0)`) are "passthrough" scopes — their inner
920    // calls bubble up to the nearest `function_declaration` or
921    // `method_definition` ancestor. Named const-assigned arrows (`const f =
922    // () => ...`) are NOT passthrough; they are their own def targets.
923    //
924    // This runs as a post-pass so that the main-loop dedup check for existing
925    // (byte_offset, name) pairs can prevent duplicates from being emitted when
926    // a call is already attributed to the correct named function.
927    if is_js_or_ts_language(&call_config.language) {
928        extract_js_closure_call_edges(source, tree.root_node(), defs);
929    }
930}
931
932// ── JS / TS closure call-edge attribution ────────────────────────────────────
933
934/// Returns `true` if `lang` is one of the JavaScript / TypeScript grammars.
935///
936/// Uses the same node-kind-count proxy as [`languages::is_rust_language`].
937/// JS, TS, and TSX all need closure attribution (B-0005).
938fn is_js_or_ts_language(lang: &tree_sitter::Language) -> bool {
939    let js_lang: tree_sitter::Language = tree_sitter_javascript::LANGUAGE.into();
940    let ts_lang: tree_sitter::Language = tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into();
941    let tsx_lang: tree_sitter::Language = tree_sitter_typescript::LANGUAGE_TSX.into();
942    let matches = |reference: tree_sitter::Language| {
943        lang.abi_version() == reference.abi_version()
944            && lang.node_kind_count() == reference.node_kind_count()
945    };
946    matches(js_lang) || matches(ts_lang) || matches(tsx_lang)
947}
948
949/// Returns `true` if `node` is an `arrow_function` or `function_expression`
950/// that is passed directly as an argument to another call expression.
951///
952/// The check is: the node's immediate parent is an `arguments` node.
953/// This distinguishes "argument closures" (passthrough for call attribution)
954/// from "named const arrows" (`const f = () => ...`) whose parent is a
955/// `variable_declarator`.
956fn is_argument_closure(node: tree_sitter::Node<'_>) -> bool {
957    matches!(node.kind(), "arrow_function" | "function_expression")
958        && node.parent().is_some_and(|p| p.kind() == "arguments")
959}
960
961/// Walk up the tree from `node`, passing through all transparent nodes
962/// (`arrow_function`, `function_expression`, `statement_block`, `arguments`,
963/// `call_expression`, `variable_declarator`, `lexical_declaration`,
964/// `expression_statement`, etc.) until we reach a `function_declaration` or
965/// `method_definition`, which is the nearest named enclosing function.
966///
967/// Returns `None` if no named function is found above `node` (e.g. the
968/// closure lives at module/file scope).
969fn nearest_named_fn_ancestor(node: tree_sitter::Node<'_>) -> Option<tree_sitter::Node<'_>> {
970    let mut current = node.parent()?;
971    loop {
972        match current.kind() {
973            // Named function boundaries: stop here.
974            "function_declaration" | "method_definition" => return Some(current),
975            // All other node kinds are transparent: keep climbing.
976            //
977            // The explicit list of transparent kinds (closures, declarations,
978            // control-flow, etc.) is documented here for readability, but the
979            // wildcard covers any new node kinds added in future grammar
980            // versions — the walk is intentionally permissive so that
981            // attribution never silently breaks on grammar updates.
982            //
983            // Transparent: "arrow_function", "function_expression",
984            // "statement_block", "arguments", "call_expression",
985            // "member_expression", "variable_declarator", "lexical_declaration",
986            // "expression_statement", "return_statement", "await_expression",
987            // "class_body", "class_declaration", "export_statement",
988            // "if_statement", "while_statement", "for_statement", etc.
989            _ => {}
990        }
991        match current.parent() {
992            Some(p) => current = p,
993            None => return None,
994        }
995    }
996}
997
998/// Collect all `call_expression` callee names that appear directly inside
999/// `closure` (an `arrow_function` or `function_expression`), visiting only
1000/// one level of closure depth (recursive closures are handled by the outer
1001/// DFS in [`extract_js_closure_call_edges`]).
1002///
1003/// Returns a `Vec` of `(callee_name, byte_offset)` pairs matching the callee
1004/// capture of the JS call query: either a bare `identifier` or the
1005/// `property_identifier` from a `member_expression`.
1006fn collect_calls_in_closure<'a>(
1007    source: &'a str,
1008    closure: tree_sitter::Node<'a>,
1009) -> Vec<(String, u32)> {
1010    let mut results = Vec::new();
1011    let mut stack: Vec<tree_sitter::Node<'_>> = Vec::new();
1012    // Start from the closure body, not the closure node itself.
1013    let body = closure.child_by_field_name("body").unwrap_or(closure);
1014    let mut cursor = body.walk();
1015    for child in body.children(&mut cursor) {
1016        stack.push(child);
1017    }
1018
1019    while let Some(node) = stack.pop() {
1020        if node.kind() == "call_expression" {
1021            // Extract callee: either `function: identifier` or
1022            // `function: member_expression property: property_identifier`.
1023            if let Some(fn_node) = node.child_by_field_name("function") {
1024                let callee_opt: Option<(String, u32)> = match fn_node.kind() {
1025                    "identifier" => {
1026                        let name = source[fn_node.start_byte()..fn_node.end_byte()].to_string();
1027                        #[expect(
1028                            clippy::cast_possible_truncation,
1029                            reason = "byte offsets fit in u32"
1030                        )]
1031                        let byte = fn_node.start_byte() as u32;
1032                        Some((name, byte))
1033                    }
1034                    "member_expression" => fn_node.child_by_field_name("property").map(|prop| {
1035                        let name = source[prop.start_byte()..prop.end_byte()].to_string();
1036                        #[expect(
1037                            clippy::cast_possible_truncation,
1038                            reason = "byte offsets fit in u32"
1039                        )]
1040                        let byte = prop.start_byte() as u32;
1041                        (name, byte)
1042                    }),
1043                    _ => None,
1044                };
1045                if let Some(pair) = callee_opt {
1046                    results.push(pair);
1047                }
1048            }
1049        }
1050        // Recurse into children, but do NOT descend into nested
1051        // arrow_function / function_expression nodes — those are either
1052        // handled by the outer DFS (if they are argument closures) or are
1053        // named const-arrows (which are their own defs and handled by the
1054        // main loop).
1055        if !matches!(node.kind(), "arrow_function" | "function_expression") {
1056            let mut c = node.walk();
1057            for child in node.children(&mut c) {
1058                stack.push(child);
1059            }
1060        }
1061    }
1062    results
1063}
1064
1065/// Post-pass for JS / TS / TSX files.
1066///
1067/// Two sub-passes:
1068///
1069/// **Pass 1 — closure-argument attribution**: For every `arrow_function` or
1070/// `function_expression` that is passed as an argument to a call (i.e., its
1071/// parent is an `arguments` node), attributes the inner calls to the nearest
1072/// named enclosing `function_declaration` or `method_definition`.
1073///
1074/// **Pass 2 — variable-declarator propagation**: For every `variable_declarator`
1075/// def that sits inside a named function, propagates all its calls to the
1076/// enclosing named function. This captures the outer call itself (e.g.
1077/// `useCallback(...)` in `const handler = useCallback(...)`) which the main
1078/// loop attributed to `handler` (the smallest enclosing def) but which also
1079/// belongs to the enclosing React component.
1080///
1081/// Both passes use `(byte_offset, name)` deduplication so calls are never
1082/// duplicated on a single def.
1083///
1084/// This fixes B-0005: React hooks (`useCallback`, `useMemo`, `useEffect`),
1085/// timer APIs (`setTimeout`, `setInterval`), array methods
1086/// (`.map`, `.filter`, `.forEach`, `.reduce`), and Express middleware
1087/// (`app.use`, `app.get`) all pass closures as arguments. Without this
1088/// post-pass, every function called INSIDE such a closure appears dead
1089/// because the call edge is attributed to the anonymous closure (which has
1090/// no def of its own) rather than to the enclosing named function.
1091///
1092/// **Rule** (from DESIGN §B-0005): anonymous closures and arrows are
1093/// "passthrough". Walk up to the nearest `function_declaration` /
1094/// `method_definition`. If none is found (closure at module scope), no edge
1095/// is emitted — matching the existing behaviour for top-level JS calls.
1096fn extract_js_closure_call_edges(
1097    source: &str,
1098    root: tree_sitter::Node<'_>,
1099    defs: &mut [Definition],
1100) {
1101    // Pass 1: walk tree — attribute calls inside argument-closure bodies to
1102    // the nearest named enclosing function.
1103    {
1104        let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1105        while let Some(node) = stack.pop() {
1106            if is_argument_closure(node) {
1107                // Find the nearest named function ancestor.
1108                if let Some(named_fn) = nearest_named_fn_ancestor(node) {
1109                    // Identify which def this named function corresponds to by
1110                    // matching byte range and kind.
1111                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1112                    let fn_start = named_fn.start_byte() as u32;
1113                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1114                    let fn_end = named_fn.end_byte() as u32;
1115
1116                    let def_idx = defs.iter().position(|d| {
1117                        d.start_byte == fn_start
1118                            && d.end_byte == fn_end
1119                            && matches!(
1120                                d.kind.as_str(),
1121                                "function_declaration" | "method_definition"
1122                            )
1123                    });
1124
1125                    if let Some(idx) = def_idx {
1126                        // Collect calls inside this closure (one level deep;
1127                        // nested closures are handled by their own DFS iteration).
1128                        let inner_calls = collect_calls_in_closure(source, node);
1129                        for (name, byte_offset) in inner_calls {
1130                            // Skip self-recursive calls.
1131                            if defs[idx].name == name {
1132                                continue;
1133                            }
1134                            // Skip duplicate edges already present in the def.
1135                            if defs[idx]
1136                                .calls
1137                                .iter()
1138                                .any(|c| c.byte_offset == byte_offset && c.name == name)
1139                            {
1140                                continue;
1141                            }
1142                            defs[idx].calls.push(CallRef {
1143                                name,
1144                                qualified_path: None,
1145                                receiver_type: None,
1146                                byte_offset,
1147                                resolved: None,
1148                            });
1149                        }
1150                    }
1151                }
1152            }
1153            // Recurse into all children.
1154            let mut cursor = node.walk();
1155            for child in node.children(&mut cursor) {
1156                stack.push(child);
1157            }
1158        }
1159    }
1160
1161    // Pass 2: propagate calls from variable_declarator defs to their enclosing
1162    // named function defs.
1163    //
1164    // When a JS pattern like `const handler = useCallback(...)` creates a
1165    // `variable_declarator` def named `handler`, the main loop attributes
1166    // calls like `useCallback(...)` to `handler` (the smallest enclosing def).
1167    // Those calls also semantically belong to the enclosing named function
1168    // (e.g. `Component`) because they execute in its runtime context.
1169    //
1170    // We collect (variable_declarator_idx, enclosing_named_fn_idx) pairs first
1171    // to avoid mutating `defs` while iterating over it.
1172    let propagation_pairs: Vec<(usize, usize)> = defs
1173        .iter()
1174        .enumerate()
1175        .filter(|(_, d)| d.kind == "variable_declarator")
1176        .filter_map(|(var_idx, var_def)| {
1177            // Find the smallest named function def that strictly contains this
1178            // variable_declarator (not equal byte range — strict containment).
1179            let enclosing = defs
1180                .iter()
1181                .enumerate()
1182                .filter(|(enc_idx, enc)| {
1183                    *enc_idx != var_idx
1184                        && matches!(
1185                            enc.kind.as_str(),
1186                            "function_declaration" | "method_definition"
1187                        )
1188                        && enc.start_byte <= var_def.start_byte
1189                        && var_def.end_byte <= enc.end_byte
1190                })
1191                .min_by_key(|(_, enc)| enc.end_byte - enc.start_byte)
1192                .map(|(i, _)| i);
1193            enclosing.map(|enc_idx| (var_idx, enc_idx))
1194        })
1195        .collect();
1196
1197    for (var_idx, enc_idx) in propagation_pairs {
1198        // Clone the calls to propagate (avoiding borrow-checker conflict).
1199        let calls_to_propagate: Vec<(String, u32)> = defs[var_idx]
1200            .calls
1201            .iter()
1202            .map(|c| (c.name.clone(), c.byte_offset))
1203            .collect();
1204        for (name, byte_offset) in calls_to_propagate {
1205            // Skip self-recursive edges.
1206            if defs[enc_idx].name == name {
1207                continue;
1208            }
1209            // Skip duplicates.
1210            if defs[enc_idx]
1211                .calls
1212                .iter()
1213                .any(|c| c.byte_offset == byte_offset && c.name == name)
1214            {
1215                continue;
1216            }
1217            defs[enc_idx].calls.push(CallRef {
1218                name,
1219                qualified_path: None,
1220                receiver_type: None,
1221                byte_offset,
1222                resolved: None,
1223            });
1224        }
1225    }
1226}
1227
1228// HCL: post-pass call-edge extraction for terraform_remote_state and module
1229// blocks. These are not function calls — they are HCL-specific structural
1230// references to other Terraform modules — so the generic
1231// `(function_call (identifier) @callee) @call` pattern in
1232// `call_query_for_extension("tf")` cannot capture them. This helper runs
1233// once per HCL file inside `extract_calls` (R2 + R3, Wave 3).
1234
1235/// Walk an HCL parse tree and emit CallRef entries for:
1236///
1237/// 1. `data.terraform_remote_state.<NAME>.outputs.<ATTR>` expressions:
1238///    one CallRef per reference, with `name = NAME` and
1239///    `qualified_path = Some("terraform_remote_state.NAME")`. These
1240///    connect the current file to the named remote-state module's outputs.
1241///
1242/// 2. `module "X" { source = "../X" }` blocks: one CallRef with
1243///    `name = X` (the label) and `qualified_path = Some("module.X")`.
1244///    The module reference connects to the module's directory in
1245///    `resolve_import` (HCL module-source resolution is not implemented
1246///    yet — the qualified_path carrier is the contract; resolve adds the
1247///    file lookup).
1248///
1249/// Each emitted CallRef is attached to the smallest enclosing definition
1250/// by byte range — matching the same heuristic used by `extract_calls`.
1251fn extract_hcl_call_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1252    // Walk all named descendants iteratively.
1253    let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1254    while let Some(node) = stack.pop() {
1255        // Defer to a function-style helper per node kind.
1256        match node.kind() {
1257            "expression" => hcl_visit_expression(source, node, defs),
1258            "block" => hcl_visit_block(source, node, defs),
1259            _ => {}
1260        }
1261        // Recurse into named children.
1262        let mut cursor = node.walk();
1263        for child in node.children(&mut cursor) {
1264            if child.is_named() {
1265                stack.push(child);
1266            }
1267        }
1268    }
1269}
1270
1271/// Inspect an HCL `expression` node for the
1272/// `data.terraform_remote_state.<NAME>.outputs.<ATTR>` reference pattern.
1273///
1274/// The expression tree looks like:
1275/// ```text
1276/// expression
1277///   variable_expr
1278///     identifier "data"
1279///   get_attr
1280///     identifier "terraform_remote_state"
1281///   get_attr
1282///     identifier "<NAME>"
1283///   get_attr
1284///     identifier "outputs"
1285///   get_attr
1286///     identifier "<ATTR>"
1287/// ```
1288fn hcl_visit_expression(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1289    // Collect children: must be `variable_expr` (with identifier="data")
1290    // followed by a chain of `get_attr` nodes (each with an `identifier` child).
1291    let mut cursor = node.walk();
1292    let mut child_iter = node.children(&mut cursor);
1293    let Some(first) = child_iter.next() else {
1294        return;
1295    };
1296    if first.kind() != "variable_expr" {
1297        return;
1298    }
1299    let Some(first_id) = first.child_by_field_name("name").or_else(|| {
1300        // Fallback: find first named child that's an identifier.
1301        let mut c = first.walk();
1302        first.children(&mut c).find(|n| n.kind() == "identifier")
1303    }) else {
1304        return;
1305    };
1306    if &source[first_id.start_byte()..first_id.end_byte()] != "data" {
1307        return;
1308    }
1309
1310    // Collect identifiers from the chain of get_attr.
1311    let mut chain: Vec<String> = Vec::new();
1312    for child in child_iter {
1313        if child.kind() != "get_attr" {
1314            return; // not a pure attribute chain
1315        }
1316        let mut gc = child.walk();
1317        let id = child.children(&mut gc).find(|n| n.kind() == "identifier");
1318        let Some(id_node) = id else { return };
1319        chain.push(source[id_node.start_byte()..id_node.end_byte()].to_string());
1320    }
1321
1322    // Expect: terraform_remote_state, <NAME>, outputs, <ATTR>
1323    if chain.len() < 2 || chain[0] != "terraform_remote_state" {
1324        return;
1325    }
1326    let name = chain[1].clone();
1327    let qualified_path = format!("terraform_remote_state.{name}");
1328
1329    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1330    let call_byte = node.start_byte() as u32;
1331    attach_hcl_call(defs, call_byte, name, Some(qualified_path));
1332}
1333
1334/// Inspect an HCL `block` node for the `module "X" { source = "../X" }`
1335/// pattern. Emits one CallRef per matching block.
1336fn hcl_visit_block(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1337    // First child must be identifier="module".
1338    let mut cursor = node.walk();
1339    let children: Vec<tree_sitter::Node<'_>> = node.children(&mut cursor).collect();
1340    let Some(first) = children.first() else {
1341        return;
1342    };
1343    if first.kind() != "identifier" || &source[first.start_byte()..first.end_byte()] != "module" {
1344        return;
1345    }
1346    // Next child should be a string_lit (the module label).
1347    let label_node = children.iter().find(|c| c.kind() == "string_lit");
1348    let Some(label_node) = label_node else {
1349        return;
1350    };
1351    let mut lc = label_node.walk();
1352    let template = label_node
1353        .children(&mut lc)
1354        .find(|n| n.kind() == "template_literal");
1355    let Some(template) = template else {
1356        return;
1357    };
1358    let label = source[template.start_byte()..template.end_byte()].to_string();
1359    let qualified_path = format!("module.{label}");
1360
1361    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1362    let call_byte = node.start_byte() as u32;
1363    attach_hcl_call(defs, call_byte, label, Some(qualified_path));
1364}
1365
1366/// Attach a synthesized HCL CallRef to the smallest enclosing definition.
1367/// Mirrors the byte-range attribution from `extract_calls`.
1368fn attach_hcl_call(
1369    defs: &mut [Definition],
1370    call_byte: u32,
1371    name: String,
1372    qualified_path: Option<String>,
1373) {
1374    let enclosing_idx = defs
1375        .iter()
1376        .enumerate()
1377        .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
1378        .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
1379        .map(|(i, _)| i);
1380    if let Some(idx) = enclosing_idx {
1381        // Skip self-recursive emission (would happen if the enclosing def
1382        // happens to share the same `name` as the synthesized callee).
1383        if defs[idx].name != name {
1384            defs[idx].calls.push(CallRef {
1385                name,
1386                qualified_path,
1387                receiver_type: None,
1388                byte_offset: call_byte,
1389                resolved: None,
1390            });
1391        }
1392    }
1393}
1394
1395/// Walk a C / C++ parse tree and emit synthetic CallRef entries for every
1396/// function identifier appearing inside a struct or array initializer
1397/// literal (I#55, 4.1.5).
1398///
1399/// Two syntactic forms produce edges:
1400///
1401/// 1. **Designated initializer** — `(initializer_pair value: (identifier))`
1402///    — corresponds to `.field = funcname` syntax (Linux `file_operations`,
1403///    ALSA `snd_pcm_ops`, etc):
1404///    ```c
1405///    static const struct file_operations my_fops = {
1406///        .read = my_read,    //  ← edge: my_fops → my_read
1407///        .write = my_write,  //  ← edge: my_fops → my_write
1408///    };
1409///    ```
1410///
1411/// 2. **Positional initializer** — `(initializer_list (identifier))` —
1412///    corresponds to bare identifier slots inside `{ ... }` (redis
1413///    `redisCommandTable`, libuv handle vtables):
1414///    ```c
1415///    struct redisCommand cmds[] = {
1416///        {"get", getCommand, 2},   //  ← edge: cmds → getCommand
1417///        {"set", setCommand, -3},  //  ← edge: cmds → setCommand
1418///    };
1419///    ```
1420///
1421/// Each emitted CallRef is attached to the smallest enclosing definition by
1422/// byte range (same heuristic as `extract_calls` / `attach_hcl_call`). For
1423/// the typical use case the enclosing def is the array/struct declaration
1424/// itself (e.g. `my_fops` or `cmds`). When the enclosing def is itself the
1425/// referenced function (e.g. a struct field designator inside a function
1426/// body) the self-recursive edge is suppressed.
1427///
1428/// Non-identifier initializer values (string literals, integers, nested
1429/// braces) are skipped by tree-sitter's `kind() == "identifier"` filter, so
1430/// no false-positive edges to undefined symbols are emitted. The resolver
1431/// (`resolve_calls`) then either binds the identifier to a real function
1432/// def (preserved as a real edge) or leaves `resolved = None` (dropped at
1433/// edge-construction time, matching how all unresolved CallRefs behave).
1434fn extract_c_struct_init_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1435    // Iterative DFS to avoid stack-blowup on deeply-nested initializer
1436    // tables (some kernel drivers nest 4–5 levels of struct-of-struct).
1437    let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1438    while let Some(node) = stack.pop() {
1439        match node.kind() {
1440            // Designated initializer: `.field = funcname`. The grammar
1441            // exposes the rhs as a positional child, but using the `value:`
1442            // field selector picks it unambiguously when present and falls
1443            // back to the last named child otherwise. We only consume bare
1444            // identifier values — string_literal, number_literal, and
1445            // nested initializer_list are dispatched separately (the
1446            // recursion below re-enters nested initializer_list nodes).
1447            "initializer_pair" => {
1448                let value_node = node.child_by_field_name("value").or_else(|| {
1449                    // Fallback: last named child that isn't a designator.
1450                    let mut c = node.walk();
1451                    node.children(&mut c)
1452                        .filter(|n| n.is_named() && n.kind() != "field_designator")
1453                        .last()
1454                });
1455                if let Some(v) = value_node
1456                    && v.kind() == "identifier"
1457                {
1458                    emit_c_init_edge(source, v, defs);
1459                }
1460            }
1461            // Positional initializer: bare identifier directly inside an
1462            // `initializer_list`. We do NOT recurse from here to pick up
1463            // identifiers — the outer DFS already visits every node, and
1464            // emitting on direct identifier children of `initializer_list`
1465            // covers `{"name", funcname, 2}` patterns. Nested
1466            // `initializer_list` children (e.g. `{{...},{...}}`) are
1467            // popped onto the stack and processed in their own iteration.
1468            "initializer_list" => {
1469                let mut c = node.walk();
1470                for child in node.children(&mut c) {
1471                    if child.kind() == "identifier" {
1472                        emit_c_init_edge(source, child, defs);
1473                    }
1474                }
1475            }
1476            _ => {}
1477        }
1478        // Recurse into named children. The DFS visits the entire subtree
1479        // so nested `initializer_pair` / `initializer_list` nodes are
1480        // reached without special handling.
1481        let mut cursor = node.walk();
1482        for child in node.children(&mut cursor) {
1483            if child.is_named() {
1484                stack.push(child);
1485            }
1486        }
1487    }
1488}
1489
1490/// Emit one synthetic CallRef edge for a C/C++ struct-literal fnptr
1491/// reference. `ident_node` must be an `identifier` node; its text becomes
1492/// the callee name. The edge is attached to the smallest enclosing
1493/// definition (typically the surrounding `declaration` def for the table
1494/// variable itself).
1495fn emit_c_init_edge(source: &str, ident_node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1496    let name = source[ident_node.start_byte()..ident_node.end_byte()].to_string();
1497    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1498    let call_byte = ident_node.start_byte() as u32;
1499
1500    let enclosing_idx = defs
1501        .iter()
1502        .enumerate()
1503        .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
1504        .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
1505        .map(|(i, _)| i);
1506
1507    let Some(idx) = enclosing_idx else {
1508        return;
1509    };
1510    // Skip self-recursive emission (would happen if the enclosing def is
1511    // itself the referenced function — e.g. a static initializer inside a
1512    // function body whose .field designates the same function).
1513    if defs[idx].name == name {
1514        return;
1515    }
1516    // Skip duplicate edges (same caller, same name, same byte offset) —
1517    // belt-and-braces against the DFS visiting an identifier through both
1518    // its parent initializer_pair and a containing initializer_list scan.
1519    if defs[idx]
1520        .calls
1521        .iter()
1522        .any(|c| c.byte_offset == call_byte && c.name == name)
1523    {
1524        return;
1525    }
1526    defs[idx].calls.push(CallRef {
1527        name,
1528        qualified_path: None,
1529        receiver_type: None,
1530        byte_offset: call_byte,
1531        resolved: None,
1532    });
1533}
1534
1535/// Infer method-call receiver types from local context within a parse tree.
1536///
1537/// Returns a map from `byte_offset_of_@callee_capture` to a receiver type string.
1538///
1539/// Dispatches to a language-specific collector:
1540///
1541/// - **Rust**: [`collect_rust_receiver_types`] — three heuristic cases:
1542///   1. `self.method()` inside `impl Foo { … }` → `"Foo"`.
1543///   2. `x.method()` where `x: Bar` is a function parameter → `"Bar"`.
1544///   3. `x.method()` after `let x = Foo::new()` → `"Foo"`.
1545///
1546/// - **Python**: [`collect_python_receiver_types`] — two heuristic cases:
1547///   1. `self.method()` inside a class method → class name from enclosing
1548///      `class_definition`.
1549///   2. `instance.method()` where `instance: ClassName` type annotation or
1550///      `instance = ClassName(...)` assignment is visible in the same scope.
1551///
1552/// - **Go**: [`collect_go_receiver_types`] — one heuristic case:
1553///   1. `recv.Method()` inside a `method_declaration` where `recv` is the
1554///      named receiver parameter → receiver type from the method signature.
1555///
1556/// This is heuristic, not type-inference-complete. Unknown/ambiguous cases
1557/// produce no entry in the map; `extract_calls` leaves those `receiver_type = None`.
1558fn infer_receiver_types(
1559    source: &str,
1560    tree: &tree_sitter::Tree,
1561    language: &tree_sitter::Language,
1562) -> HashMap<u32, String> {
1563    let mut map: HashMap<u32, String> = HashMap::new();
1564
1565    if languages::is_rust_language(language) {
1566        collect_rust_receiver_types(source, tree.root_node(), &mut map);
1567    } else if languages::is_python_language(language) {
1568        collect_python_receiver_types(source, tree.root_node(), &mut map);
1569    } else if languages::is_go_language(language) {
1570        collect_go_receiver_types(source, tree.root_node(), &mut map);
1571    }
1572    // Other languages: no receiver inference — leave map empty.
1573
1574    map
1575}
1576
1577/// Walk the Rust parse tree and fill `map` with receiver-type inference.
1578///
1579/// This is a recursive descent that tracks:
1580/// - The current `impl Foo` or `impl Foo for Bar` type name (for `self.*` calls).
1581/// - Parameter type annotations (for `x: SomeType` → `x` has type `SomeType`).
1582/// - Constructor let-bindings (`let x = Foo::new()` → `x` has type `Foo`).
1583fn collect_rust_receiver_types(
1584    source: &str,
1585    node: tree_sitter::Node<'_>,
1586    map: &mut HashMap<u32, String>,
1587) {
1588    // We use a stack-based walk to avoid deep recursion on large files.
1589    // Each stack entry carries (node, impl_type_context).
1590    let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(node, None)];
1591
1592    while let Some((n, impl_ctx)) = stack.pop() {
1593        match n.kind() {
1594            "impl_item" => {
1595                // Extract `impl Foo` or `impl Trait for Foo` → capture the `for` type.
1596                // tree-sitter-rust shape: `(impl_item type: (type_identifier) @type)` for
1597                // inherent impls, and `(impl_item trait: … type: (type_identifier) @type)`
1598                // for trait impls. Both have a child named `type`.
1599                let impl_type = extract_impl_self_type(source, n);
1600                let new_ctx = impl_type.or_else(|| impl_ctx.clone());
1601                let mut cursor = n.walk();
1602                for child in n.children(&mut cursor) {
1603                    stack.push((child, new_ctx.clone()));
1604                }
1605            }
1606            "function_item" => {
1607                // Build parameter bindings: (param_name → type_name).
1608                let param_types = extract_param_types(source, n);
1609                // Build let-binding type map from constructor calls.
1610                let let_types = extract_let_binding_types(source, n);
1611                // Annotate call sites within this function body.
1612                annotate_method_calls(
1613                    source,
1614                    n,
1615                    impl_ctx.as_deref(),
1616                    &param_types,
1617                    &let_types,
1618                    map,
1619                );
1620                // Do NOT recurse into function_item children with the outer stack —
1621                // function bodies are fully handled by annotate_method_calls.
1622                // (Nested fn items would re-enter via their own impl_item context.)
1623                // Push children with same impl_ctx so nested impl blocks are found.
1624                let mut cursor = n.walk();
1625                for child in n.children(&mut cursor) {
1626                    stack.push((child, impl_ctx.clone()));
1627                }
1628            }
1629            _ => {
1630                let mut cursor = n.walk();
1631                for child in n.children(&mut cursor) {
1632                    stack.push((child, impl_ctx.clone()));
1633                }
1634            }
1635        }
1636    }
1637}
1638
1639/// Extract the self type from an `impl_item` node.
1640///
1641/// For `impl Foo { … }` → `Some("Foo")`.
1642/// For `impl Trait for Foo { … }` → `Some("Foo")` (the concrete `for` type).
1643fn extract_impl_self_type(source: &str, impl_node: tree_sitter::Node<'_>) -> Option<String> {
1644    // tree-sitter-rust: impl_item has a field named "type" for the self type.
1645    // For `impl Foo for Bar { }`, "type" is Bar; for `impl Foo { }`, "type" is Foo.
1646    let type_node = impl_node.child_by_field_name("type")?;
1647    Some(source[type_node.start_byte()..type_node.end_byte()].to_string())
1648}
1649
1650/// Extract parameter name → type mappings from a function signature.
1651///
1652/// Handles `fn foo(x: Bar, y: Baz)` → `{"x": "Bar", "y": "Baz"}`.
1653/// The `self`/`&self`/`&mut self` parameter is skipped (handled via impl_ctx).
1654fn extract_param_types(source: &str, fn_node: tree_sitter::Node<'_>) -> HashMap<String, String> {
1655    let mut params: HashMap<String, String> = HashMap::new();
1656    let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1657        return params;
1658    };
1659    let mut cursor = params_node.walk();
1660    for param in params_node.children(&mut cursor) {
1661        if param.kind() == "parameter" {
1662            // parameter has children: pattern (identifier) and type
1663            let mut param_name = None;
1664            let mut param_type = None;
1665            let mut pc = param.walk();
1666            for child in param.children(&mut pc) {
1667                match child.kind() {
1668                    "identifier" | "mutable_specifier" if param_name.is_none() => {
1669                        let text = source[child.start_byte()..child.end_byte()].to_string();
1670                        if text != "mut" {
1671                            param_name = Some(text);
1672                        }
1673                    }
1674                    "type_identifier"
1675                    | "generic_type"
1676                    | "reference_type"
1677                    | "scoped_type_identifier"
1678                        if param_type.is_none() =>
1679                    {
1680                        // Extract the base type identifier from potentially complex types.
1681                        param_type = Some(extract_base_type(source, child));
1682                    }
1683                    _ => {}
1684                }
1685            }
1686            if let (Some(name), Some(ty)) = (param_name, param_type)
1687                && !ty.is_empty()
1688            {
1689                params.insert(name, ty);
1690            }
1691        }
1692        // Also handle typed_pattern in newer grammars
1693        if param.kind() == "typed_pattern" {
1694            let mut name_part = None;
1695            let mut type_part = None;
1696            let mut pc = param.walk();
1697            for child in param.children(&mut pc) {
1698                if child.kind() == "identifier" && name_part.is_none() {
1699                    name_part = Some(source[child.start_byte()..child.end_byte()].to_string());
1700                } else if matches!(
1701                    child.kind(),
1702                    "type_identifier"
1703                        | "generic_type"
1704                        | "reference_type"
1705                        | "scoped_type_identifier"
1706                ) && type_part.is_none()
1707                {
1708                    type_part = Some(extract_base_type(source, child));
1709                }
1710            }
1711            if let (Some(name), Some(ty)) = (name_part, type_part)
1712                && !ty.is_empty()
1713            {
1714                params.insert(name, ty);
1715            }
1716        }
1717    }
1718    params
1719}
1720
1721/// Extract the base `TypeIdentifier` from a potentially complex type node.
1722///
1723/// For `Bar`, `&Bar`, `&mut Bar`, `Bar<T>` → returns `"Bar"`.
1724/// For `module::Bar` → returns `"Bar"` (bare name for matching).
1725fn extract_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
1726    match node.kind() {
1727        "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
1728        "generic_type" | "reference_type" | "mutable_specifier" | "scoped_type_identifier" => {
1729            // Recurse to find the innermost type_identifier
1730            let mut cursor = node.walk();
1731            for child in node.children(&mut cursor) {
1732                let t = extract_base_type(source, child);
1733                if !t.is_empty() {
1734                    return t;
1735                }
1736            }
1737            String::new()
1738        }
1739        _ => {
1740            // For other nodes, try children
1741            let mut cursor = node.walk();
1742            for child in node.children(&mut cursor) {
1743                if child.kind() == "type_identifier" {
1744                    return source[child.start_byte()..child.end_byte()].to_string();
1745                }
1746            }
1747            String::new()
1748        }
1749    }
1750}
1751
1752/// Scan a function body for `let x = Foo::new()` patterns.
1753///
1754/// Returns a map from local variable name to the constructor type name.
1755/// E.g., `let x = Foo::new();` → `{"x": "Foo"}`.
1756fn extract_let_binding_types(
1757    source: &str,
1758    fn_node: tree_sitter::Node<'_>,
1759) -> HashMap<String, String> {
1760    let mut bindings: HashMap<String, String> = HashMap::new();
1761
1762    let Some(body) = fn_node.child_by_field_name("body") else {
1763        return bindings;
1764    };
1765
1766    // Walk the function body looking for let_declaration nodes.
1767    let mut stack = vec![body];
1768    while let Some(n) = stack.pop() {
1769        if n.kind() == "let_declaration" {
1770            // let_declaration: pattern (identifier) + value (call_expression or …)
1771            let mut binding_name = None;
1772            let mut constructor_type = None;
1773            let mut cursor = n.walk();
1774            for child in n.children(&mut cursor) {
1775                match child.kind() {
1776                    "identifier" if binding_name.is_none() => {
1777                        binding_name =
1778                            Some(source[child.start_byte()..child.end_byte()].to_string());
1779                    }
1780                    "call_expression" => {
1781                        // Look for `Foo::new()` or `Foo::from(…)` patterns.
1782                        // The function child of call_expression is a scoped_identifier.
1783                        if let Some(func) = child.child_by_field_name("function")
1784                            && func.kind() == "scoped_identifier"
1785                        {
1786                            // scoped_identifier path: `Foo::new` — extract head segment.
1787                            let full = source[func.start_byte()..func.end_byte()].to_string();
1788                            let head = full.split("::").next().unwrap_or("").to_string();
1789                            if !head.is_empty()
1790                                && head.chars().next().is_some_and(char::is_uppercase)
1791                            {
1792                                constructor_type = Some(head);
1793                            }
1794                        }
1795                    }
1796                    _ => {}
1797                }
1798            }
1799            if let (Some(name), Some(ty)) = (binding_name, constructor_type) {
1800                bindings.insert(name, ty);
1801            }
1802        }
1803        // Push children for recursive walk.
1804        let mut cursor = n.walk();
1805        for child in n.children(&mut cursor) {
1806            stack.push(child);
1807        }
1808    }
1809
1810    bindings
1811}
1812
1813/// Walk a function body and annotate method-call byte offsets with receiver types.
1814///
1815/// A "method call" in the ripvec call query is:
1816/// `(call_expression function: (field_expression field: (field_identifier) @callee))`
1817///
1818/// The receiver is the `value` child of `field_expression`. This function
1819/// checks whether the receiver is:
1820/// - `self` → use `impl_ctx` type.
1821/// - An identifier matching a parameter type in `param_types`.
1822/// - An identifier matching a constructor let-binding in `let_types`.
1823fn annotate_method_calls(
1824    source: &str,
1825    fn_node: tree_sitter::Node<'_>,
1826    impl_ctx: Option<&str>,
1827    param_types: &HashMap<String, String>,
1828    let_types: &HashMap<String, String>,
1829    map: &mut HashMap<u32, String>,
1830) {
1831    // Walk the entire function (including its body) looking for call_expression nodes.
1832    let mut stack = vec![fn_node];
1833    while let Some(n) = stack.pop() {
1834        if n.kind() == "call_expression"
1835            && let Some(func) = n.child_by_field_name("function")
1836            && func.kind() == "field_expression"
1837        {
1838            // field_expression: value (receiver) + field (method name identifier)
1839            if let (Some(recv), Some(field)) = (
1840                func.child_by_field_name("value"),
1841                func.child_by_field_name("field"),
1842            ) {
1843                let recv_text = source[recv.start_byte()..recv.end_byte()].to_string();
1844                let receiver_type = if recv_text == "self" || recv_text == "*self" {
1845                    impl_ctx.map(str::to_owned)
1846                } else {
1847                    // Strip ref sigils for lookup.
1848                    let base = recv_text
1849                        .trim_start_matches('*')
1850                        .trim_start_matches('&')
1851                        .trim();
1852                    param_types
1853                        .get(base)
1854                        .or_else(|| let_types.get(base))
1855                        .cloned()
1856                };
1857
1858                if let Some(ty) = receiver_type {
1859                    // The `@callee` capture byte offset is the start of the field node.
1860                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1861                    let field_byte = field.start_byte() as u32;
1862                    map.insert(field_byte, ty);
1863                }
1864            }
1865        }
1866        let mut cursor = n.walk();
1867        for child in n.children(&mut cursor) {
1868            stack.push(child);
1869        }
1870    }
1871}
1872
1873// ── Python receiver-type heuristic ───────────────────────────────────
1874
1875/// Walk the Python parse tree and fill `map` with receiver-type inference.
1876///
1877/// Two heuristic cases:
1878///
1879/// 1. **`self.method()` inside a method** — when the `attribute` call receiver is
1880///    the literal text `self`, the receiver type is the name of the nearest
1881///    enclosing `class_definition`.
1882///
1883/// 2. **`instance.method()` with a type annotation or constructor call** —
1884///    when a function parameter has a PEP 484 annotation `param: ClassName` or
1885///    when a local assignment `param = ClassName(...)` precedes the call, the
1886///    receiver type is bound to `ClassName`.
1887///
1888/// The Python call query captures:
1889/// - `(call function: (attribute attribute: (identifier) @callee)) @call`
1890///
1891/// Within the `attribute` node, `value` is the receiver expression and
1892/// `attribute` is the method name (the `@callee` capture). The `@callee`
1893/// byte offset is the start of the `attribute` child identifier node.
1894fn collect_python_receiver_types(
1895    source: &str,
1896    root: tree_sitter::Node<'_>,
1897    map: &mut HashMap<u32, String>,
1898) {
1899    // Stack carries (node, class_ctx: Option<String>).
1900    // class_ctx is the name of the nearest enclosing class_definition.
1901    let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(root, None)];
1902
1903    while let Some((n, class_ctx)) = stack.pop() {
1904        match n.kind() {
1905            "class_definition" => {
1906                // Extract the class name from the `name` child.
1907                let class_name = n
1908                    .child_by_field_name("name")
1909                    .map(|c| source[c.start_byte()..c.end_byte()].to_string());
1910                let new_ctx = class_name.or_else(|| class_ctx.clone());
1911                let mut cursor = n.walk();
1912                for child in n.children(&mut cursor) {
1913                    stack.push((child, new_ctx.clone()));
1914                }
1915            }
1916            "function_definition" => {
1917                // Build parameter annotation map: param_name → type_name.
1918                let param_types = extract_python_param_types(source, n);
1919                // Build local assignment map: var_name → constructor type.
1920                let let_types = extract_python_assignment_types(source, n);
1921                // Annotate attribute call sites within this function body.
1922                annotate_python_method_calls(
1923                    source,
1924                    n,
1925                    class_ctx.as_deref(),
1926                    &param_types,
1927                    &let_types,
1928                    map,
1929                );
1930                // Push children with same class_ctx so nested classes are found.
1931                let mut cursor = n.walk();
1932                for child in n.children(&mut cursor) {
1933                    stack.push((child, class_ctx.clone()));
1934                }
1935            }
1936            _ => {
1937                let mut cursor = n.walk();
1938                for child in n.children(&mut cursor) {
1939                    stack.push((child, class_ctx.clone()));
1940                }
1941            }
1942        }
1943    }
1944}
1945
1946/// Extract Python parameter name → type annotation mappings.
1947///
1948/// Handles PEP 484 style: `def foo(self, x: Bar, y: Baz) -> ...`.
1949/// The `self` parameter is excluded (handled via class_ctx).
1950/// Returns `{"x": "Bar", "y": "Baz"}`.
1951fn extract_python_param_types(
1952    source: &str,
1953    fn_node: tree_sitter::Node<'_>,
1954) -> HashMap<String, String> {
1955    let mut params: HashMap<String, String> = HashMap::new();
1956    let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1957        return params;
1958    };
1959
1960    // Parameters node children include `identifier`, `typed_parameter`,
1961    // `typed_default_parameter`, and others.
1962    let mut cursor = params_node.walk();
1963    for param in params_node.children(&mut cursor) {
1964        match param.kind() {
1965            "typed_parameter" => {
1966                // (typed_parameter (identifier) @name type: (type) @type)
1967                // First identifier child is the name; type child is the type.
1968                let mut name_text = None;
1969                let mut type_text = None;
1970                let mut pc = param.walk();
1971                for child in param.children(&mut pc) {
1972                    match child.kind() {
1973                        "identifier" if name_text.is_none() => {
1974                            let t = source[child.start_byte()..child.end_byte()].to_string();
1975                            if t != "self" && t != "cls" {
1976                                name_text = Some(t);
1977                            }
1978                        }
1979                        "type" | "identifier" | "attribute"
1980                            if type_text.is_none() && name_text.is_some() =>
1981                        {
1982                            // The type child in tree-sitter-python is a `type` node
1983                            // whose text is the annotation expression. Extract the
1984                            // base identifier (handle `Optional[Bar]`, `List[Bar]`, etc.)
1985                            type_text = Some(extract_python_base_type(source, child));
1986                        }
1987                        _ => {}
1988                    }
1989                }
1990                if let (Some(name), Some(ty)) = (name_text, type_text)
1991                    && !ty.is_empty()
1992                    && !ty.eq("self")
1993                    && !ty.eq("cls")
1994                {
1995                    params.insert(name, ty);
1996                }
1997            }
1998            "typed_default_parameter" => {
1999                // (typed_default_parameter name: (identifier) type: (type) value: …)
2000                let name_node = param.child_by_field_name("name");
2001                let type_node = param.child_by_field_name("type");
2002                if let (Some(nn), Some(tn)) = (name_node, type_node) {
2003                    let name = source[nn.start_byte()..nn.end_byte()].to_string();
2004                    if name != "self" && name != "cls" {
2005                        let ty = extract_python_base_type(source, tn);
2006                        if !ty.is_empty() {
2007                            params.insert(name, ty);
2008                        }
2009                    }
2010                }
2011            }
2012            _ => {}
2013        }
2014    }
2015    params
2016}
2017
2018/// Extract the base type name from a Python type annotation node.
2019///
2020/// For `Bar` → `"Bar"`. For `Optional[Bar]` or `List[Bar]` → `"Bar"`.
2021/// For `module.Class` → `"Class"` (bare name only).
2022fn extract_python_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2023    match node.kind() {
2024        "identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2025        // tree-sitter-python wraps annotations in a `type` node
2026        "type" => {
2027            let mut cursor = node.walk();
2028            for child in node.children(&mut cursor) {
2029                let t = extract_python_base_type(source, child);
2030                if !t.is_empty() {
2031                    return t;
2032                }
2033            }
2034            String::new()
2035        }
2036        // Generic alias: `Optional[Bar]` — the first identifier child is `Optional`,
2037        // the subscript child contains `Bar`. We want the subscript content.
2038        "subscript" => {
2039            // subscript has value (e.g. Optional) and subscript (e.g. Bar).
2040            // Return the subscript's base type (the inner type argument).
2041            if let Some(sub) = node.child_by_field_name("subscript") {
2042                return extract_python_base_type(source, sub);
2043            }
2044            // Fall back: first identifier
2045            let mut cursor = node.walk();
2046            for child in node.children(&mut cursor) {
2047                if child.kind() == "identifier" {
2048                    return source[child.start_byte()..child.end_byte()].to_string();
2049                }
2050            }
2051            String::new()
2052        }
2053        // Attribute node `module.Class` → take last identifier
2054        "attribute" => {
2055            if let Some(attr) = node.child_by_field_name("attribute") {
2056                return source[attr.start_byte()..attr.end_byte()].to_string();
2057            }
2058            String::new()
2059        }
2060        _ => {
2061            // Try first identifier child
2062            let mut cursor = node.walk();
2063            for child in node.children(&mut cursor) {
2064                if child.kind() == "identifier" {
2065                    return source[child.start_byte()..child.end_byte()].to_string();
2066                }
2067            }
2068            String::new()
2069        }
2070    }
2071}
2072
2073/// Scan a Python function body for `x = ClassName(...)` assignment patterns.
2074///
2075/// Returns a map from local variable name to constructor type.
2076/// E.g., `x = Foo()` → `{"x": "Foo"}`.
2077/// Also handles `x = module.ClassName(...)` → `{"x": "ClassName"}`.
2078fn extract_python_assignment_types(
2079    source: &str,
2080    fn_node: tree_sitter::Node<'_>,
2081) -> HashMap<String, String> {
2082    let mut bindings: HashMap<String, String> = HashMap::new();
2083    let Some(body) = fn_node.child_by_field_name("body") else {
2084        return bindings;
2085    };
2086
2087    let mut stack = vec![body];
2088    while let Some(n) = stack.pop() {
2089        if n.kind() == "assignment" {
2090            // assignment: left = right
2091            // We want: left is a simple identifier, right is a call whose
2092            // function is an identifier starting with an uppercase letter
2093            // (Python convention for class names).
2094            let left = n.child_by_field_name("left");
2095            let right = n.child_by_field_name("right");
2096            if let (Some(lhs), Some(rhs)) = (left, right)
2097                && lhs.kind() == "identifier"
2098                && rhs.kind() == "call"
2099                && let Some(func) = rhs.child_by_field_name("function")
2100            {
2101                let var_name = source[lhs.start_byte()..lhs.end_byte()].to_string();
2102                let constructor_type = match func.kind() {
2103                    "identifier" => {
2104                        let t = source[func.start_byte()..func.end_byte()].to_string();
2105                        // Class names are conventionally uppercase-first
2106                        if t.chars().next().is_some_and(char::is_uppercase) {
2107                            Some(t)
2108                        } else {
2109                            None
2110                        }
2111                    }
2112                    "attribute" => {
2113                        // `module.ClassName(...)` — take the `attribute` part
2114                        func.child_by_field_name("attribute")
2115                            .map(|a| source[a.start_byte()..a.end_byte()].to_string())
2116                    }
2117                    _ => None,
2118                };
2119                if let Some(ty) = constructor_type {
2120                    bindings.insert(var_name, ty);
2121                }
2122            }
2123        }
2124        let mut cursor = n.walk();
2125        for child in n.children(&mut cursor) {
2126            stack.push(child);
2127        }
2128    }
2129    bindings
2130}
2131
2132/// Walk a Python function body and annotate attribute-call byte offsets.
2133///
2134/// A Python method call is:
2135/// `(call function: (attribute value: <receiver> attribute: (identifier) @callee))`
2136///
2137/// The receiver is the `value` child of `attribute`. This function checks:
2138/// - `self` → use the enclosing class name (`class_ctx`).
2139/// - An identifier matching a parameter type in `param_types`.
2140/// - An identifier matching a constructor assignment in `let_types`.
2141fn annotate_python_method_calls(
2142    source: &str,
2143    fn_node: tree_sitter::Node<'_>,
2144    class_ctx: Option<&str>,
2145    param_types: &HashMap<String, String>,
2146    let_types: &HashMap<String, String>,
2147    map: &mut HashMap<u32, String>,
2148) {
2149    let mut stack = vec![fn_node];
2150    while let Some(n) = stack.pop() {
2151        if n.kind() == "call"
2152            && let Some(func) = n.child_by_field_name("function")
2153            && func.kind() == "attribute"
2154            && let (Some(recv_node), Some(attr_node)) = (
2155                func.child_by_field_name("object"),
2156                func.child_by_field_name("attribute"),
2157            )
2158        {
2159            // attribute node: object (receiver) + attribute (method name)
2160            let recv_text = source[recv_node.start_byte()..recv_node.end_byte()].to_string();
2161            let receiver_type = if recv_text == "self" || recv_text == "cls" {
2162                class_ctx.map(str::to_owned)
2163            } else if recv_node.kind() == "identifier" {
2164                param_types
2165                    .get(&recv_text)
2166                    .or_else(|| let_types.get(&recv_text))
2167                    .cloned()
2168            } else {
2169                None
2170            };
2171
2172            if let Some(ty) = receiver_type {
2173                // The `@callee` capture byte offset is the `attribute` child.
2174                #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2175                let attr_byte = attr_node.start_byte() as u32;
2176                map.insert(attr_byte, ty);
2177            }
2178        }
2179        let mut cursor = n.walk();
2180        for child in n.children(&mut cursor) {
2181            stack.push(child);
2182        }
2183    }
2184}
2185
2186// ── Python class hierarchy (MRO) extraction ───────────────────────────
2187
2188/// Walk the Python parse tree and add class → parent-names entries to `out`.
2189///
2190/// tree-sitter-python shape:
2191/// ```text
2192/// (class_definition
2193///   name: (identifier) @child
2194///   superclasses: (argument_list
2195///     (identifier) @parent             ; bare parent: class Foo(Bar):
2196///     (attribute attribute: (identifier) @parent) ; qualified: class Foo(mod.Bar):
2197///     (keyword_argument …)             ; ignored: class Foo(Bar, metaclass=Meta):
2198///   )?
2199///   ...)
2200/// ```
2201///
2202/// Each map entry's key is a class defined in the file; the value is the
2203/// ordered list of declared parent class **names** (the trailing `attribute`
2204/// segment, so `mod.Bar` becomes `Bar`). Classes with no declared parents
2205/// still get an entry (empty `Vec`) so a downstream MRO walk can tell
2206/// "known class with no parents" from "unknown class".
2207fn extract_python_class_hierarchy_node(
2208    source: &str,
2209    root: tree_sitter::Node<'_>,
2210    out: &mut HashMap<String, Vec<String>>,
2211) {
2212    let mut stack = vec![root];
2213    while let Some(n) = stack.pop() {
2214        if n.kind() == "class_definition"
2215            && let Some(name_node) = n.child_by_field_name("name")
2216        {
2217            let class_name = source[name_node.start_byte()..name_node.end_byte()].to_string();
2218            let mut parents: Vec<String> = Vec::new();
2219            if let Some(superclasses) = n.child_by_field_name("superclasses") {
2220                // superclasses is an `argument_list`; iterate its children and
2221                // collect identifiers / attribute trailing-names. Skip
2222                // keyword_argument entries (metaclass=…, etc.) and punctuation.
2223                let mut sc = superclasses.walk();
2224                for child in superclasses.children(&mut sc) {
2225                    match child.kind() {
2226                        "identifier" => {
2227                            let t = source[child.start_byte()..child.end_byte()].to_string();
2228                            parents.push(t);
2229                        }
2230                        "attribute" => {
2231                            // module.Cls → take the trailing `attribute` segment.
2232                            if let Some(attr) = child.child_by_field_name("attribute") {
2233                                parents
2234                                    .push(source[attr.start_byte()..attr.end_byte()].to_string());
2235                            }
2236                        }
2237                        // Drop keyword_argument, "(", ")", ",", comments, etc.
2238                        _ => {}
2239                    }
2240                }
2241            }
2242            out.insert(class_name, parents);
2243        }
2244        let mut cursor = n.walk();
2245        for child in n.children(&mut cursor) {
2246            stack.push(child);
2247        }
2248    }
2249}
2250
2251/// Extract the Python `class → [parents]` map from a single source file by
2252/// parsing it with tree-sitter-python.
2253///
2254/// Returns an empty map when the source fails to parse or contains no
2255/// `class_definition` nodes. The returned map is the per-file contribution
2256/// to the global hierarchy used by [`resolve_calls_with_python_mro_pub`]
2257/// for MRO-aware receiver-type dispatch (Q1, Wave 2).
2258#[must_use]
2259pub fn extract_python_class_hierarchy(source: &str) -> HashMap<String, Vec<String>> {
2260    let mut parser = Parser::new();
2261    let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
2262    if parser.set_language(&lang).is_err() {
2263        return HashMap::new();
2264    }
2265    let Some(tree) = parser.parse(source, None) else {
2266        return HashMap::new();
2267    };
2268    let mut out: HashMap<String, Vec<String>> = HashMap::new();
2269    extract_python_class_hierarchy_node(source, tree.root_node(), &mut out);
2270    out
2271}
2272
2273/// Compute the linearised MRO (Method Resolution Order) for a Python class
2274/// name using a **simplified left-first depth-first walk** of the declared
2275/// `class → [parents]` hierarchy.
2276///
2277/// Python's real MRO uses C3 linearisation, which is monotonic and respects
2278/// declaration order across the diamond inheritance shape. For ripvec's
2279/// reverse-call-graph purpose we want *any plausible ancestor* of the
2280/// receiver type — including ancestors only reachable via a mixin — so we
2281/// can resolve `self.method()` calls whose dispatch lands on an ancestor.
2282/// The simplification: pre-order DFS, left-to-right, skipping cycles via a
2283/// `visited` set.
2284///
2285/// On a non-diamond shape this matches C3 exactly. On a diamond the
2286/// simplified walk may surface an ancestor earlier than C3 would, but every
2287/// ancestor C3 would visit is still reached — and ripvec's goal is "find
2288/// the implementing def for an inherited call", not "compute the runtime
2289/// dispatch winner". Over-approximating ancestors only matters when two
2290/// ancestors define the same method, and even then the left-first order
2291/// matches C3 on the common patterns
2292/// (`class Sub(Base, Mixin)` → `Sub, Base, Mixin, <Base's ancestors>, <Mixin's ancestors>`).
2293///
2294/// The returned list excludes the start class itself. Each entry appears at
2295/// most once even when reachable through multiple parent chains.
2296fn compute_python_mro<H: std::hash::BuildHasher>(
2297    start: &str,
2298    hierarchy: &HashMap<String, Vec<String>, H>,
2299) -> Vec<String> {
2300    use std::collections::HashSet;
2301    let mut order: Vec<String> = Vec::new();
2302    let mut visited: HashSet<String> = HashSet::new();
2303    // Start with the immediate parents of `start` (the receiver-type's own
2304    // scope was already searched by Priority 2's direct match).
2305    let Some(start_parents) = hierarchy.get(start) else {
2306        return order;
2307    };
2308    // DFS stack: we push in reverse so pop yields left-first order.
2309    let mut stack: Vec<String> = start_parents.iter().rev().cloned().collect();
2310    while let Some(cls) = stack.pop() {
2311        if !visited.insert(cls.clone()) {
2312            continue;
2313        }
2314        order.push(cls.clone());
2315        if let Some(parents) = hierarchy.get(&cls) {
2316            for p in parents.iter().rev() {
2317                if !visited.contains(p) {
2318                    stack.push(p.clone());
2319                }
2320            }
2321        }
2322    }
2323    order
2324}
2325
2326// ── Go receiver-type heuristic ────────────────────────────────────────
2327
2328/// Walk the Go parse tree and fill `map` with receiver-type inference.
2329///
2330/// One heuristic case: **`recv.Method()` inside a `method_declaration`**.
2331///
2332/// Go methods have an explicit receiver parameter in their signature:
2333/// `func (r *Foo) Bar() { r.Baz() }` — `r` is bound to type `Foo`.
2334///
2335/// The Go call query captures:
2336/// `(call_expression function: (selector_expression field: (field_identifier) @callee))`
2337///
2338/// Within `selector_expression`, `operand` is the receiver expression and
2339/// `field` is the method name (the `@callee` capture).
2340///
2341/// This function also handles `self.method()` patterns for cases where code
2342/// uses `self` as a receiver name (not idiomatic Go, but it occurs).
2343fn collect_go_receiver_types(
2344    source: &str,
2345    root: tree_sitter::Node<'_>,
2346    map: &mut HashMap<u32, String>,
2347) {
2348    // Stack carries (node, receiver_binding: Option<(recv_name, recv_type)>).
2349    let mut stack: Vec<(tree_sitter::Node<'_>, Option<(String, String)>)> = vec![(root, None)];
2350
2351    while let Some((n, recv_binding)) = stack.pop() {
2352        if n.kind() == "method_declaration" {
2353            // Extract the receiver name and type from the method signature.
2354            let binding = extract_go_receiver_binding(source, n);
2355            let new_binding = binding.or_else(|| recv_binding.clone());
2356            let mut cursor = n.walk();
2357            for child in n.children(&mut cursor) {
2358                stack.push((child, new_binding.clone()));
2359            }
2360        } else {
2361            // For any call_expression whose function is a selector_expression,
2362            // check if the operand matches the active receiver binding.
2363            if n.kind() == "call_expression"
2364                && let Some(func) = n.child_by_field_name("function")
2365                && func.kind() == "selector_expression"
2366                && let (Some(operand), Some(field)) = (
2367                    func.child_by_field_name("operand"),
2368                    func.child_by_field_name("field"),
2369                )
2370            {
2371                let recv_text = source[operand.start_byte()..operand.end_byte()].to_string();
2372                let receiver_type = recv_binding.as_ref().and_then(|(recv_name, recv_ty)| {
2373                    if recv_text == *recv_name {
2374                        Some(recv_ty.clone())
2375                    } else {
2376                        None
2377                    }
2378                });
2379
2380                if let Some(ty) = receiver_type {
2381                    // The `@callee` capture byte offset is the `field` child.
2382                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2383                    let field_byte = field.start_byte() as u32;
2384                    map.insert(field_byte, ty);
2385                }
2386            }
2387
2388            let mut cursor = n.walk();
2389            for child in n.children(&mut cursor) {
2390                stack.push((child, recv_binding.clone()));
2391            }
2392        }
2393    }
2394}
2395
2396/// Extract the receiver name and base type from a Go `method_declaration`.
2397///
2398/// Go method declaration shape (tree-sitter-go):
2399/// ```text
2400/// (method_declaration
2401///   receiver: (parameter_list
2402///     (parameter_declaration
2403///       name: (identifier)       ← receiver name
2404///       type: (type_identifier   ← receiver type (bare)
2405///            | pointer_type (type_identifier)) ← or *Type
2406///     )
2407///   )
2408///   name: (field_identifier)
2409///   ...
2410/// )
2411/// ```
2412///
2413/// Returns `Some((recv_name, type_name))` or `None` if the receiver is unnamed
2414/// (blank identifier `_`) or has an unrecognisable shape.
2415fn extract_go_receiver_binding(
2416    source: &str,
2417    method_node: tree_sitter::Node<'_>,
2418) -> Option<(String, String)> {
2419    let receiver_list = method_node.child_by_field_name("receiver")?;
2420    // parameter_list contains one parameter_declaration
2421    let mut cursor = receiver_list.walk();
2422    for param in receiver_list.children(&mut cursor) {
2423        if param.kind() == "parameter_declaration" {
2424            let name_node = param.child_by_field_name("name");
2425            let type_node = param.child_by_field_name("type");
2426            if let (Some(nn), Some(tn)) = (name_node, type_node) {
2427                let name = source[nn.start_byte()..nn.end_byte()].to_string();
2428                if name == "_" || name.is_empty() {
2429                    return None;
2430                }
2431                let ty = extract_go_base_type(source, tn);
2432                if !ty.is_empty() {
2433                    return Some((name, ty));
2434                }
2435            }
2436        }
2437    }
2438    None
2439}
2440
2441/// Extract the base type name from a Go type node.
2442///
2443/// For `Foo` (type_identifier) → `"Foo"`.
2444/// For `*Foo` (pointer_type → type_identifier) → `"Foo"`.
2445fn extract_go_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2446    match node.kind() {
2447        "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2448        "pointer_type" => {
2449            // pointer_type has one child: the pointee type
2450            let mut cursor = node.walk();
2451            for child in node.children(&mut cursor) {
2452                if child.kind() == "type_identifier" {
2453                    return source[child.start_byte()..child.end_byte()].to_string();
2454                }
2455                let t = extract_go_base_type(source, child);
2456                if !t.is_empty() {
2457                    return t;
2458                }
2459            }
2460            String::new()
2461        }
2462        _ => {
2463            let mut cursor = node.walk();
2464            for child in node.children(&mut cursor) {
2465                if child.kind() == "type_identifier" {
2466                    return source[child.start_byte()..child.end_byte()].to_string();
2467                }
2468            }
2469            String::new()
2470        }
2471    }
2472}
2473
2474/// Enrich Go `method_declaration` definition scopes with their receiver type name.
2475///
2476/// In the generic `extract_definitions` path, `build_scope_chain` walks the
2477/// *parent* chain of the `@def` node. For Go `method_declaration`, the parent
2478/// is the file root — so the scope is always `""`.
2479///
2480/// An empty scope means `resolve_calls` Priority 2 (receiver-type matching via
2481/// `scope.contains(recv_type)`) never fires for Go methods. Cross-file calls
2482/// where the caller inferred `receiver_type = Some("Foo")` stay unresolved;
2483/// no edge is recorded; `def_callers[]` stays empty for those defs — the root
2484/// cause of the missing inverse index for Go (I#P1).
2485///
2486/// Fix: after `extract_definitions`, parse the Go source a second time to find
2487/// each `method_declaration`'s receiver type, then set the matching def's scope
2488/// to `"method_declaration {ReceiverType}"`.  This matches the pattern used by
2489/// the existing `go_resolve_receiver_method_via_signature` integration test,
2490/// which asserts that `scope.contains("Foo")` succeeds when the scope is
2491/// `"method_declaration Foo"`.
2492///
2493/// Matching is by `start_byte` (precise) so name collisions across different
2494/// receiver types are handled correctly.
2495fn enrich_go_method_def_scopes(source: &str, defs: &mut [Definition]) {
2496    let go_lang: tree_sitter::Language = tree_sitter_go::LANGUAGE.into();
2497    let mut parser = Parser::new();
2498    if parser.set_language(&go_lang).is_err() {
2499        return;
2500    }
2501    let Some(tree) = parser.parse(source, None) else {
2502        return;
2503    };
2504
2505    // Walk all top-level method_declaration nodes.
2506    let root = tree.root_node();
2507    let mut method_cursor = root.walk();
2508    for child in root.children(&mut method_cursor) {
2509        if child.kind() != "method_declaration" {
2510            continue;
2511        }
2512        let Some((_, recv_type)) = extract_go_receiver_binding(source, child) else {
2513            continue;
2514        };
2515        // Match by start_byte (precise): the @def node for method_declaration in
2516        // the Go definition query is the method_declaration node itself, so its
2517        // start_byte matches the def's start_byte recorded during extract_definitions.
2518        #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2519        let method_start_byte = child.start_byte() as u32;
2520        for def in defs.iter_mut() {
2521            if def.kind == "method_declaration" && def.start_byte == method_start_byte {
2522                def.scope = format!("method_declaration {recv_type}");
2523                break;
2524            }
2525        }
2526    }
2527}
2528
2529/// Public wrapper for `enrich_go_method_def_scopes` — enables integration tests
2530/// to call it directly without going through the full `build_graph` pipeline.
2531pub fn enrich_go_method_def_scopes_pub(source: &str, defs: &mut [Definition]) {
2532    enrich_go_method_def_scopes(source, defs);
2533}
2534
2535/// SQL: prepend a synthetic whole-file definition whose name is the filename stem.
2536///
2537/// dbt and sqlmesh follow a filename-as-model-name convention:
2538///
2539/// - `silver_issuer_returns.sql` defines the `silver_issuer_returns` model.
2540/// - `gold_issuer_returns.sql` references the silver model by filename stem,
2541///   not by any in-source CREATE TABLE.
2542///
2543/// In sqlmesh, the in-source name is templated:
2544///
2545/// ```sql
2546/// MODEL (
2547///   name @{athena_sqlmesh_silver_schema}.issuer_returns,
2548///   ...
2549/// );
2550/// SELECT ... FROM @{athena_sqlmesh_silver_schema}.stg_issuer_returns;
2551/// ```
2552///
2553/// The `MODEL (...)` header parses as an ERROR node under tree-sitter-sequel
2554/// because `@{var}` is not standard SQL; FROM/JOIN further down the file still
2555/// extract cleanly. Without a synthetic def, `lsp_workspace_symbols(query=
2556/// "silver_issuer_returns")` returns no hits — there is no real CREATE TABLE
2557/// in the file and the model name is interpolation only.
2558///
2559/// This helper prepends a definition with:
2560/// - `name` = filename stem (e.g., `silver_issuer_returns`)
2561/// - `kind` = `"sql_file"` (maps to `LSP SymbolKind::File` in
2562///   [`languages::lsp_symbol_kind_for_node_kind`])
2563/// - byte range = the entire source `[0, source.len())`
2564/// - scope / signature / qualified_name = empty / None
2565///
2566/// The whole-file byte range is the key to FROM/JOIN attribution: when
2567/// `extract_calls` later places a CallRef from a FROM clause that is not
2568/// inside any CTE or other smaller def, the smallest-enclosing-def search
2569/// lands on this synthetic file def and the edge is recorded.
2570///
2571/// If the filename has no stem (empty / `..`), the helper is a no-op.
2572/// Idempotent: if a `sql_file` def already exists at byte 0, it is left alone.
2573pub(crate) fn enrich_sql_file_def(filename: &str, source: &str, defs: &mut Vec<Definition>) {
2574    // Idempotency: do nothing if a sql_file def is already present at byte 0.
2575    if defs
2576        .iter()
2577        .any(|d| d.kind == "sql_file" && d.start_byte == 0)
2578    {
2579        return;
2580    }
2581
2582    // Derive the filename stem (last path component, file extension stripped).
2583    let stem = std::path::Path::new(filename)
2584        .file_stem()
2585        .and_then(|s| s.to_str())
2586        .unwrap_or_default();
2587    if stem.is_empty() {
2588        return;
2589    }
2590
2591    // Count newlines so end_line is reasonable for downstream UI.
2592    let end_line_zero_based = source.bytes().filter(|&b| b == b'\n').count();
2593    #[expect(clippy::cast_possible_truncation, reason = "line counts fit in u32")]
2594    let end_line = (end_line_zero_based as u32) + 1;
2595    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2596    let end_byte = source.len() as u32;
2597
2598    let file_def = Definition {
2599        name: stem.to_string(),
2600        kind: "sql_file".to_string(),
2601        start_line: 1,
2602        end_line,
2603        scope: String::new(),
2604        signature: None,
2605        start_byte: 0,
2606        end_byte,
2607        calls: vec![],
2608        decorator: None,
2609        lsp_kind_hint: None,
2610    };
2611    // Prepend so it remains the outermost (largest) enclosing def at byte 0,
2612    // ensuring narrow CTE defs are still preferred for inner-FROM attribution
2613    // (the smallest-enclosing rule in `extract_calls`).
2614    defs.insert(0, file_def);
2615}
2616
2617/// Public wrapper for [`enrich_sql_file_def`] — enables integration tests
2618/// to call it directly without going through the full `build_graph` pipeline.
2619///
2620/// This is `pub` (not `pub(crate)`) because integration tests in
2621/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
2622/// `pub(crate)` items.
2623pub fn enrich_sql_file_def_pub(filename: &str, source: &str, defs: &mut Vec<Definition>) {
2624    enrich_sql_file_def(filename, source, defs);
2625}
2626
2627/// Public wrapper for `extract_calls` — enables integration tests to call it
2628/// directly without going through the full `build_graph` pipeline.
2629///
2630/// This is `pub` (not `pub(crate)`) because integration tests in
2631/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
2632/// `pub(crate)` items.
2633pub fn extract_calls_pub(
2634    source: &str,
2635    call_config: &languages::CallConfig,
2636    defs: &mut [Definition],
2637) {
2638    extract_calls(source, call_config, defs);
2639}
2640
2641/// Public wrapper for [`extract_definitions`] — enables integration tests in
2642/// `crates/ripvec-core/tests/` to drive Python decorator detection (C1, 4.1.1).
2643///
2644/// Returns the [`Definition`] list for `source` parsed as the language
2645/// identified by `lang_config`.
2646#[must_use]
2647pub fn extract_definitions_pub(
2648    source: &str,
2649    lang_config: &languages::LangConfig,
2650) -> Vec<Definition> {
2651    extract_definitions(source, lang_config)
2652}
2653
2654/// Build an index from definition name to list of `DefId`s.
2655#[must_use]
2656pub fn build_def_index_pub(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2657    build_def_index(files)
2658}
2659
2660fn build_def_index(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
2661    let mut index: HashMap<String, Vec<DefId>> = HashMap::new();
2662    for (file_idx, file) in files.iter().enumerate() {
2663        for (def_idx, def) in file.defs.iter().enumerate() {
2664            #[expect(clippy::cast_possible_truncation)]
2665            let did: DefId = (file_idx as u32, def_idx as u16);
2666            index.entry(def.name.clone()).or_default().push(did);
2667        }
2668    }
2669    index
2670}
2671
2672/// Resolve call references to target definitions.
2673///
2674/// Resolution priority:
2675///
2676/// 1. **Qualified path** (`qualified_path = Some("mod_a::foo")`): filter candidates
2677///    by qualifier match (file path or scope contains the qualifier segment). Unique
2678///    match → resolve; ambiguous or no match → leave `None`.
2679/// 2. **Receiver type — direct scope match** (`receiver_type = Some("Foo")`):
2680///    for method calls, prefer candidates whose `scope` contains the receiver
2681///    type name (e.g., `"impl_item Foo"`). Among receiver-matching candidates,
2682///    further prefer those in imported files. Unique match → resolve;
2683///    ambiguous → leave `None`.
2684///    When this step finds nothing on the receiver class itself, sub-step 2b
2685///    (Python MRO walk) runs: when `Foo` has a recorded parent chain, walk
2686///    the receiver class's MRO (left-first DFS) and try the scope-match
2687///    against each ancestor's name. First ancestor with a matching candidate
2688///    wins. See [`compute_python_mro`] for the simplification rationale (it
2689///    diverges from C3 only on diamond shapes where two ancestors define the
2690///    same name).
2691/// 3. **Same file** (unqualified, no receiver): prefer definitions in the caller's
2692///    own file.
2693/// 4. **SQL suffix-match** (sql_file callers only, no exact-name match): when
2694///    the caller def has `kind = "sql_file"` and the bare `call_name` (e.g.,
2695///    `"issuer_returns"`) has no exact entry in the def index, scan all
2696///    `sql_file` defs for names ending with `_<call_name>` (e.g.,
2697///    `"silver_issuer_returns"`). This bridges dbt / sqlmesh layered schema
2698///    prefixes: `gold_issuer_returns.sql` uses `FROM @{schema}.issuer_returns`
2699///    which tree-sitter reduces to the bare name `"issuer_returns"`, while the
2700///    target def is the synthetic `sql_file` def named `"silver_issuer_returns"`.
2701///    Unique suffix-match → resolve; ambiguous (multiple layers match) or
2702///    no match → leave `None`. Non-sql_file callers are explicitly excluded.
2703/// 5. **Imported file** (unqualified, no receiver): check definitions in files this
2704///    file imports. Unique imported candidate → resolve.
2705/// 6. **Global-unique fallback**: when a bare call name maps to exactly one def in the
2706///    entire graph — regardless of file or import relationship — resolve to it.
2707///    Handles trait-method dispatch (`Trait::method` called as bare `method`) and
2708///    struct constructors referenced across non-imported module boundaries.
2709///    Only fires when Priorities 1–5 left the call unresolved and exactly one
2710///    candidate exists. Ambiguous (>1 candidates) → leave `None`.
2711/// 7. **Ambiguous or unresolved**: leave `resolved` as `None` (no silent first-wins).
2712///
2713/// Equivalent to [`resolve_calls_with_python_mro_pub`] with an empty MRO map
2714/// (Priority 2.5 is a no-op).
2715pub fn resolve_calls_pub<S: std::hash::BuildHasher>(
2716    files: &mut [FileNode],
2717    def_index: &HashMap<String, Vec<DefId>, S>,
2718) {
2719    let empty: HashMap<String, Vec<String>> = HashMap::new();
2720    resolve_calls_inner(files, def_index, &empty);
2721}
2722
2723/// Resolve call references with MRO-aware Python receiver dispatch enabled.
2724///
2725/// Identical to [`resolve_calls_pub`] except that Priority 2.5 (the MRO walk)
2726/// fires when the caller passes a non-empty `python_class_hierarchy`.
2727/// `build_graph` populates the hierarchy by parsing every Python source file
2728/// with [`extract_python_class_hierarchy`] and merging the per-file maps.
2729///
2730/// Tests that want to exercise MRO resolution without going through
2731/// `build_graph` can call this directly with a synthetic hierarchy.
2732pub fn resolve_calls_with_python_mro_pub<S, H>(
2733    files: &mut [FileNode],
2734    def_index: &HashMap<String, Vec<DefId>, S>,
2735    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2736) where
2737    S: std::hash::BuildHasher,
2738    H: std::hash::BuildHasher,
2739{
2740    resolve_calls_inner(files, def_index, python_class_hierarchy);
2741}
2742
2743fn resolve_calls<S, H>(
2744    files: &mut [FileNode],
2745    def_index: &HashMap<String, Vec<DefId>, S>,
2746    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2747) where
2748    S: std::hash::BuildHasher,
2749    H: std::hash::BuildHasher,
2750{
2751    resolve_calls_inner(files, def_index, python_class_hierarchy);
2752}
2753
2754#[expect(
2755    clippy::too_many_lines,
2756    reason = "8-priority resolution cascade (qualified path, receiver type, MRO walk, same-file, \
2757              SQL suffix-match, imported-file, global-unique, ambiguous); each priority is a \
2758              distinct decision branch and extracting helpers would require passing large shared \
2759              state across boundaries"
2760)]
2761fn resolve_calls_inner<S, H>(
2762    files: &mut [FileNode],
2763    def_index: &HashMap<String, Vec<DefId>, S>,
2764    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
2765) where
2766    S: std::hash::BuildHasher,
2767    H: std::hash::BuildHasher,
2768{
2769    // Pre-compute imported file sets for each file.
2770    let imported_files: Vec<std::collections::HashSet<u32>> = files
2771        .iter()
2772        .map(|f| {
2773            f.imports
2774                .iter()
2775                .filter_map(|imp| imp.resolved_idx)
2776                .collect()
2777        })
2778        .collect();
2779
2780    for file_idx in 0..files.len() {
2781        for def_idx in 0..files[file_idx].defs.len() {
2782            for call_idx in 0..files[file_idx].defs[def_idx].calls.len() {
2783                let call_name = files[file_idx].defs[def_idx].calls[call_idx].name.clone();
2784                let qualified_path = files[file_idx].defs[def_idx].calls[call_idx]
2785                    .qualified_path
2786                    .clone();
2787                let receiver_type = files[file_idx].defs[def_idx].calls[call_idx]
2788                    .receiver_type
2789                    .clone();
2790
2791                // HCL: dedicated resolution for `terraform_remote_state.<NAME>`
2792                // and `module.<NAME>` qualified paths. Aurora's module DAG is
2793                // expressed by these patterns; resolve to the first def in any
2794                // file under a `/<NAME>/` directory segment. This is the
2795                // module-source contract (R2 + R3, Wave 3).
2796                if let Some(ref qpath) = qualified_path
2797                    && (qpath.starts_with("terraform_remote_state.")
2798                        || qpath.starts_with("module."))
2799                {
2800                    let target = &call_name; // already the bare module label
2801                    let segment_match = format!("/{target}/");
2802                    let alt_segment_prefix = format!("{target}/"); // when path starts with target dir
2803                    let candidate = files.iter().enumerate().find_map(|(idx, f)| {
2804                        if f.path.contains(&segment_match)
2805                            || f.path.starts_with(&alt_segment_prefix)
2806                        {
2807                            // Pick the first def in the file (or skip if file
2808                            // has no defs).
2809                            if !f.defs.is_empty() {
2810                                #[expect(
2811                                    clippy::cast_possible_truncation,
2812                                    reason = "file index fits in u32"
2813                                )]
2814                                {
2815                                    return Some((idx as u32, 0u16));
2816                                }
2817                            }
2818                        }
2819                        None
2820                    });
2821                    if let Some(did) = candidate {
2822                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
2823                    }
2824                    continue;
2825                }
2826
2827                // ── Priority 1: Qualified-path resolution ────────────────
2828                //
2829                // `qualified_path` carries the full scoped path (e.g. "mod_a::foo").
2830                // We look up candidates by the bare `call_name`, then filter by
2831                // whether the file path or scope contains the qualifier prefix.
2832                if let Some(ref qpath) = qualified_path {
2833                    // Qualifier is everything before the final `::`.
2834                    let qualifier = if let Some(pos) = qpath.rfind("::") {
2835                        &qpath[..pos]
2836                    } else {
2837                        qpath.as_str()
2838                    };
2839                    let qual_segments: Vec<&str> = qualifier.split("::").collect();
2840
2841                    let Some(candidates) = def_index.get(&call_name) else {
2842                        continue;
2843                    };
2844
2845                    let matching: Vec<DefId> = candidates
2846                        .iter()
2847                        .copied()
2848                        .filter(|&(f_idx, _)| {
2849                            let file_path = &files[f_idx as usize].path;
2850                            let last_segment = qual_segments.last().copied().unwrap_or("");
2851                            let path_as_module =
2852                                file_path.trim_end_matches(".rs").replace(['/', '\\'], "::");
2853                            path_as_module.contains(last_segment)
2854                                || file_path.contains(last_segment)
2855                        })
2856                        .collect();
2857
2858                    if matching.len() == 1 {
2859                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(matching[0]);
2860                    }
2861                    // Ambiguous or no match → leave None.
2862                    continue;
2863                }
2864
2865                // ── Priority 3.5: SQL suffix-match resolution ────────────
2866                //
2867                // dbt / sqlmesh pipelines use layered schema prefixes:
2868                // `silver_issuer_returns.sql` defines the silver-layer model.
2869                // `gold_issuer_returns.sql` references it via a FROM clause
2870                // that tree-sitter parses as `name: "issuer_returns"` (the
2871                // `name:` field-selector strips the `@{schema}.` prefix).
2872                // `def_index.get("issuer_returns")` returns None — the def is
2873                // stored under "silver_issuer_returns".
2874                //
2875                // When: (a) no exact-name candidate exists, AND (b) the
2876                // caller's enclosing def is a sql_file (whole-file synthetic
2877                // def emitted by `enrich_sql_file_def`), walk every sql_file
2878                // def in the graph and check whether its name ends with
2879                // `_<call_name>`. Unique suffix-match → resolve. Ambiguous
2880                // (e.g., both gold_ and silver_ match the same bare name) →
2881                // leave None (no silent first-wins).
2882                //
2883                // Non-sql_file callers are explicitly excluded: a Rust
2884                // function_item whose call_name happens to end with a suffix
2885                // of some sql_file def must NOT be resolved via this path.
2886                if !def_index.contains_key(&call_name)
2887                    && files[file_idx].defs[def_idx].kind == "sql_file"
2888                    && !call_name.is_empty()
2889                {
2890                    let suffix = format!("_{call_name}");
2891                    let suffix_str = suffix.as_str();
2892                    // Exclude the caller def itself from the suffix scan:
2893                    // `gold_issuer_returns` also ends with `_issuer_returns`
2894                    // but must not self-resolve.
2895                    #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
2896                    let caller_did: DefId = (file_idx as u32, def_idx as u16);
2897                    let suffix_matches: Vec<DefId> = files
2898                        .iter()
2899                        .enumerate()
2900                        .flat_map(|(f_idx, f)| {
2901                            f.defs.iter().enumerate().filter_map(move |(d_idx, d)| {
2902                                #[expect(
2903                                    clippy::cast_possible_truncation,
2904                                    reason = "file and def indices fit in u32/u16"
2905                                )]
2906                                let did: DefId = (f_idx as u32, d_idx as u16);
2907                                if d.kind == "sql_file"
2908                                    && d.name.ends_with(suffix_str)
2909                                    && did != caller_did
2910                                {
2911                                    Some(did)
2912                                } else {
2913                                    None
2914                                }
2915                            })
2916                        })
2917                        .collect();
2918                    if suffix_matches.len() == 1 {
2919                        files[file_idx].defs[def_idx].calls[call_idx].resolved =
2920                            Some(suffix_matches[0]);
2921                    }
2922                    // Ambiguous (>1) or no match (0) → leave None.
2923                    continue;
2924                }
2925
2926                let Some(candidates) = def_index.get(&call_name) else {
2927                    continue;
2928                };
2929
2930                // ── Priority 2: Receiver-type resolution ─────────────────
2931                //
2932                // `receiver_type = Some("Foo")` means this is a method call on a
2933                // value whose type is `Foo`. Filter candidates to those whose scope
2934                // chain contains the receiver type name.
2935                if let Some(ref rtype) = receiver_type {
2936                    // Candidates whose scope contains the receiver type name.
2937                    let receiver_matching: Vec<DefId> = candidates
2938                        .iter()
2939                        .copied()
2940                        .filter(|&(f_idx, d_idx)| {
2941                            let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2942                            scope.contains(rtype.as_str())
2943                        })
2944                        .collect();
2945
2946                    if receiver_matching.len() == 1 {
2947                        files[file_idx].defs[def_idx].calls[call_idx].resolved =
2948                            Some(receiver_matching[0]);
2949                        continue;
2950                    }
2951
2952                    if receiver_matching.len() > 1 {
2953                        // Among receiver-matching candidates, prefer those in imported files.
2954                        let imported_receiver_matching: Vec<DefId> = receiver_matching
2955                            .iter()
2956                            .copied()
2957                            .filter(|(f, _)| imported_files[file_idx].contains(f))
2958                            .collect();
2959                        if imported_receiver_matching.len() == 1 {
2960                            files[file_idx].defs[def_idx].calls[call_idx].resolved =
2961                                Some(imported_receiver_matching[0]);
2962                        }
2963                        // Ambiguous even after import filter → leave None.
2964                        continue;
2965                    }
2966
2967                    // ── Priority 2.5: Python MRO walk ─────────────────────
2968                    //
2969                    // The receiver type's own scope has no matching def — but
2970                    // the method may live on an ancestor class. Walk the MRO
2971                    // (left-first DFS) and try the scope-match against each
2972                    // ancestor's name. First ancestor with at least one
2973                    // scope-matching candidate wins; if multiple candidates
2974                    // match for the same ancestor, prefer imported files,
2975                    // else take the first in stable order.
2976                    //
2977                    // Liskov: a subclass's `self.method()` call must dispatch
2978                    // through the MRO; over-approximating ancestors is the
2979                    // correct conservative move for a reverse call graph.
2980                    // For non-Python languages or Python receivers with no
2981                    // recorded parents, `compute_python_mro` returns an
2982                    // empty vector and this loop is a no-op.
2983                    let mro = compute_python_mro(rtype, python_class_hierarchy);
2984                    let mut resolved_via_mro: Option<DefId> = None;
2985                    for ancestor in &mro {
2986                        let ancestor_matching: Vec<DefId> = candidates
2987                            .iter()
2988                            .copied()
2989                            .filter(|&(f_idx, d_idx)| {
2990                                let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
2991                                scope.contains(ancestor.as_str())
2992                            })
2993                            .collect();
2994                        if ancestor_matching.len() == 1 {
2995                            resolved_via_mro = Some(ancestor_matching[0]);
2996                            break;
2997                        }
2998                        if ancestor_matching.len() > 1 {
2999                            // Prefer imported files among the ancestor matches.
3000                            let imported_ancestor: Vec<DefId> = ancestor_matching
3001                                .iter()
3002                                .copied()
3003                                .filter(|(f, _)| imported_files[file_idx].contains(f))
3004                                .collect();
3005                            if imported_ancestor.len() == 1 {
3006                                resolved_via_mro = Some(imported_ancestor[0]);
3007                                break;
3008                            }
3009                            // Ambiguous at this ancestor — pick the first
3010                            // candidate in stable order. The MRO walk's job
3011                            // is to find *an* implementing def for an
3012                            // inherited call, not to compute the runtime
3013                            // dispatch winner; any plausible candidate is
3014                            // useful for the reverse call graph.
3015                            resolved_via_mro = Some(ancestor_matching[0]);
3016                            break;
3017                        }
3018                    }
3019                    if let Some(did) = resolved_via_mro {
3020                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3021                        continue;
3022                    }
3023                    // No receiver-matching candidates anywhere in the MRO →
3024                    // fall through to bare-name resolution.
3025                }
3026
3027                // ── Priority 3: Same-file resolution ─────────────────────
3028                #[expect(clippy::cast_possible_truncation)]
3029                let file_idx_u32 = file_idx as u32;
3030                if let Some(&did) = candidates.iter().find(|(f, _)| *f == file_idx_u32) {
3031                    files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3032                    continue;
3033                }
3034
3035                // ── Priority 4: Imported-file resolution ──────────────────
3036                let imported_candidates: Vec<DefId> = candidates
3037                    .iter()
3038                    .copied()
3039                    .filter(|(f, _)| imported_files[file_idx].contains(f))
3040                    .collect();
3041                if imported_candidates.len() == 1 {
3042                    files[file_idx].defs[def_idx].calls[call_idx].resolved =
3043                        Some(imported_candidates[0]);
3044                }
3045
3046                // ── Priority 6: Global-unique fallback ────────────────────────────
3047                //
3048                // When a bare call name maps to exactly one def in the entire graph
3049                // — regardless of file or import relationship — resolve to it. This
3050                // bridges trait-method dispatch (`Trait::method` called as bare
3051                // `method`) and constructors referenced across non-imported module
3052                // boundaries.
3053                //
3054                // Guard with `.is_none()` so we do not overwrite a Priority 5
3055                // (imported-file) resolution that already fired.
3056                if candidates.len() == 1
3057                    && files[file_idx].defs[def_idx].calls[call_idx]
3058                        .resolved
3059                        .is_none()
3060                {
3061                    files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(candidates[0]);
3062                }
3063                // Priority 7: Ambiguous or unresolved → leave None.
3064            }
3065        }
3066    }
3067}
3068
3069/// Compute a prefix-sum offset table for flattening `DefId`s to linear indices.
3070fn def_offsets(files: &[FileNode]) -> Vec<usize> {
3071    let mut offsets = Vec::with_capacity(files.len() + 1);
3072    offsets.push(0);
3073    for file in files {
3074        offsets.push(offsets.last().unwrap() + file.defs.len());
3075    }
3076    offsets
3077}
3078
3079/// Flatten a `DefId` to a linear index using the offset table.
3080fn flatten_def_id(offsets: &[usize], did: DefId) -> usize {
3081    offsets[did.0 as usize] + did.1 as usize
3082}
3083
3084/// Build top-N caller and callee lists for each definition (flattened).
3085fn build_def_neighbor_lists(
3086    n: usize,
3087    edges: &[(u32, u32, u32)],
3088    offsets: &[usize],
3089) -> (Vec<Vec<DefId>>, Vec<Vec<DefId>>) {
3090    let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3091    let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3092
3093    for &(src, dst, w) in edges {
3094        let (s, d) = (src as usize, dst as usize);
3095        if s < n && d < n {
3096            incoming[d].push((src, w));
3097            outgoing[s].push((dst, w));
3098        }
3099    }
3100
3101    // Convert flat index back to DefId
3102    let to_def_id = |flat: u32| -> DefId {
3103        let flat_usize = flat as usize;
3104        let file_idx = offsets.partition_point(|&o| o <= flat_usize) - 1;
3105        let def_idx = flat_usize - offsets[file_idx];
3106        #[expect(clippy::cast_possible_truncation)]
3107        (file_idx as u32, def_idx as u16)
3108    };
3109
3110    let callers = incoming
3111        .into_iter()
3112        .map(|mut v| {
3113            v.sort_by_key(|b| std::cmp::Reverse(b.1));
3114            v.truncate(MAX_NEIGHBORS);
3115            v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
3116        })
3117        .collect();
3118
3119    let callees = outgoing
3120        .into_iter()
3121        .map(|mut v| {
3122            v.sort_by_key(|b| std::cmp::Reverse(b.1));
3123            v.truncate(MAX_NEIGHBORS);
3124            v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
3125        })
3126        .collect();
3127
3128    (callers, callees)
3129}
3130
3131// ── PageRank ─────────────────────────────────────────────────────────
3132
3133/// Compute `PageRank` scores for a graph.
3134///
3135/// If `focus` is `Some(idx)`, computes topic-sensitive `PageRank` biased
3136/// toward file `idx`. Otherwise computes standard (uniform) `PageRank`.
3137///
3138/// Returns one score per node, summing to 1.0.
3139#[expect(
3140    clippy::cast_precision_loss,
3141    reason = "node count fits comfortably in f32"
3142)]
3143fn pagerank(n: usize, edges: &[(u32, u32, u32)], focus: Option<usize>) -> Vec<f32> {
3144    if n == 0 {
3145        return vec![];
3146    }
3147
3148    // Build adjacency: out_edges[src] = [(dst, weight)]
3149    let mut out_edges: Vec<Vec<(usize, f32)>> = vec![vec![]; n];
3150    let mut out_weight: Vec<f32> = vec![0.0; n];
3151
3152    for &(src, dst, w) in edges {
3153        let (s, d) = (src as usize, dst as usize);
3154        if s < n && d < n {
3155            #[expect(clippy::cast_possible_truncation, reason = "edge weights are small")]
3156            let wf = f64::from(w) as f32;
3157            out_edges[s].push((d, wf));
3158            out_weight[s] += wf;
3159        }
3160    }
3161
3162    // Personalization vector (Haveliwala 2002, topic-sensitive PageRank).
3163    //
3164    // When a focus file is specified, the teleportation distribution is split:
3165    //   - PERSONALIZATION_ALPHA (0.15) concentrated on the focus node.
3166    //   - (1 - PERSONALIZATION_ALPHA) = 0.85 spread uniformly over the
3167    //     remaining (n - 1) other nodes.
3168    //
3169    // This gives the focus file a gentle bias over its neighbors without
3170    // collapsing every other file to an equal uniform floor. The resulting
3171    // ranks still vary across the corpus, so the caller sees a *neighborhood*
3172    // of semantically related files rebiased toward the focus (I#16 fix).
3173    //
3174    // For n == 1 there are no other nodes; the focus gets all mass (= 1.0).
3175    let bias: Vec<f32> = if let Some(idx) = focus {
3176        if n == 1 {
3177            vec![1.0_f32]
3178        } else {
3179            let other_mass = (1.0_f32 - PERSONALIZATION_ALPHA) / (n as f32 - 1.0);
3180            let mut b = vec![other_mass; n];
3181            if idx < n {
3182                b[idx] = PERSONALIZATION_ALPHA;
3183            }
3184            // Verify sum ≈ 1.0 (should hold by construction; normalization
3185            // guards against floating-point drift on very large graphs).
3186            let sum: f32 = b.iter().sum();
3187            for v in &mut b {
3188                *v /= sum;
3189            }
3190            b
3191        }
3192    } else {
3193        vec![1.0 / n as f32; n]
3194    };
3195
3196    let mut rank = vec![1.0 / n as f32; n];
3197    let mut next_rank = vec![0.0_f32; n];
3198
3199    for _ in 0..MAX_ITERATIONS {
3200        // Collect dangling mass (nodes with no outgoing edges)
3201        let dangling: f32 = rank
3202            .iter()
3203            .enumerate()
3204            .filter(|&(i, _)| out_edges[i].is_empty())
3205            .map(|(_, &r)| r)
3206            .sum();
3207
3208        // Distribute rank
3209        for (i, nr) in next_rank.iter_mut().enumerate() {
3210            *nr = (1.0 - DAMPING).mul_add(bias[i], DAMPING * dangling * bias[i]);
3211        }
3212
3213        for (src, edges_list) in out_edges.iter().enumerate() {
3214            if edges_list.is_empty() {
3215                continue;
3216            }
3217            let src_rank = rank[src];
3218            let total_w = out_weight[src];
3219            for &(dst, w) in edges_list {
3220                next_rank[dst] += DAMPING * src_rank * (w / total_w);
3221            }
3222        }
3223
3224        // Check convergence
3225        let diff: f32 = rank
3226            .iter()
3227            .zip(next_rank.iter())
3228            .map(|(a, b)| (a - b).abs())
3229            .sum();
3230
3231        std::mem::swap(&mut rank, &mut next_rank);
3232
3233        if diff < EPSILON {
3234            break;
3235        }
3236    }
3237
3238    rank
3239}
3240
3241// ── Graph Building ───────────────────────────────────────────────────
3242
3243/// Intermediate result from definition-level graph computation.
3244struct DefGraphData {
3245    def_edges: Vec<(DefId, DefId, u32)>,
3246    def_ranks: Vec<f32>,
3247    def_callers: Vec<Vec<DefId>>,
3248    def_callees: Vec<Vec<DefId>>,
3249    offsets: Vec<usize>,
3250    base_ranks: Vec<f32>,
3251    file_edges: Vec<(u32, u32, u32)>,
3252}
3253
3254/// Build bidirectional trait↔impl method edges for PageRank propagation (G3).
3255///
3256/// For every impl method that overrides a trait method, adds:
3257/// - `(impl_def_id, trait_def_id, 1)` — impl → trait
3258/// - `(trait_def_id, impl_def_id, 1)` — trait → impl
3259///
3260/// Detection heuristic: an impl method "overrides" a trait method when:
3261/// - The impl method's kind is `"function_item"` and its `scope` starts with
3262///   `"impl_item"`.
3263/// - The trait method's kind is `"function_signature_item"` and its `scope`
3264///   starts with `"trait_item"`.
3265/// - Both have the same `name`.
3266/// - The impl's file imports the trait's file (or they share a file).
3267///
3268/// This is heuristic, not sound: it may produce false positives when two
3269/// unrelated traits define methods with the same name. The practical false-
3270/// positive rate on real Rust codebases is low because method names are
3271/// usually unique within a crate.
3272#[must_use]
3273pub fn build_trait_impl_edges_pub(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
3274    build_trait_impl_edges(files)
3275}
3276
3277fn build_trait_impl_edges(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
3278    // Build index: method_name → list of (DefId, is_trait_method).
3279    // trait method: kind == "function_signature_item" (abstract) OR scope contains "trait_item".
3280    // impl method:  kind == "function_item" AND scope contains "impl_item".
3281    let mut trait_methods: HashMap<String, Vec<DefId>> = HashMap::new();
3282    let mut impl_methods: HashMap<String, Vec<DefId>> = HashMap::new();
3283
3284    for (fi, file) in files.iter().enumerate() {
3285        for (di, def) in file.defs.iter().enumerate() {
3286            #[expect(clippy::cast_possible_truncation)]
3287            let did: DefId = (fi as u32, di as u16);
3288            if def.kind == "function_signature_item"
3289                || (def.scope.starts_with("trait_item") && def.kind == "function_item")
3290            {
3291                trait_methods.entry(def.name.clone()).or_default().push(did);
3292            } else if def.kind == "function_item" && def.scope.starts_with("impl_item") {
3293                impl_methods.entry(def.name.clone()).or_default().push(did);
3294            }
3295        }
3296    }
3297
3298    // Pre-build imported-files sets to restrict matching.
3299    let imported_sets: Vec<std::collections::HashSet<u32>> = files
3300        .iter()
3301        .map(|f| {
3302            f.imports
3303                .iter()
3304                .filter_map(|imp| imp.resolved_idx)
3305                .collect()
3306        })
3307        .collect();
3308
3309    let mut edges: Vec<(DefId, DefId, u32)> = Vec::new();
3310
3311    for (name, trait_defs) in &trait_methods {
3312        let Some(impl_defs) = impl_methods.get(name) else {
3313            continue;
3314        };
3315        for &(tf, td) in trait_defs {
3316            for &(imf, imd) in impl_defs {
3317                // The impl file must import the trait file (or be the same file).
3318                let connected = tf == imf
3319                    || imported_sets
3320                        .get(imf as usize)
3321                        .is_some_and(|s| s.contains(&tf));
3322                if connected {
3323                    let trait_id: DefId = (tf, td);
3324                    let impl_id: DefId = (imf, imd);
3325                    edges.push((trait_id, impl_id, 1));
3326                    edges.push((impl_id, trait_id, 1));
3327                }
3328            }
3329        }
3330    }
3331
3332    edges
3333}
3334
3335/// Build definition-level edges, compute `PageRank`, and derive file-level data.
3336fn compute_def_graph(files: &[FileNode]) -> DefGraphData {
3337    // Build definition-level edge list from resolved calls
3338    let mut def_edge_map: HashMap<(DefId, DefId), u32> = HashMap::new();
3339    for (file_idx, file) in files.iter().enumerate() {
3340        for (def_idx, def) in file.defs.iter().enumerate() {
3341            #[expect(clippy::cast_possible_truncation)]
3342            let caller_id: DefId = (file_idx as u32, def_idx as u16);
3343            for call in &def.calls {
3344                if let Some(callee_id) = call.resolved {
3345                    *def_edge_map.entry((caller_id, callee_id)).or_insert(0) += 1;
3346                }
3347            }
3348        }
3349    }
3350
3351    // Add trait↔impl bidirectional edges (G3).
3352    let trait_impl_edges = build_trait_impl_edges(files);
3353    for (src, dst, w) in trait_impl_edges {
3354        *def_edge_map.entry((src, dst)).or_insert(0) += w;
3355    }
3356
3357    let def_edges: Vec<(DefId, DefId, u32)> = def_edge_map
3358        .into_iter()
3359        .map(|((src, dst), w)| (src, dst, w))
3360        .collect();
3361
3362    // Compute def-level PageRank
3363    let offsets = def_offsets(files);
3364    let n_defs = *offsets.last().unwrap_or(&0);
3365
3366    let flat_def_edges: Vec<(u32, u32, u32)> = def_edges
3367        .iter()
3368        .map(|(src, dst, w)| {
3369            #[expect(clippy::cast_possible_truncation)]
3370            (
3371                flatten_def_id(&offsets, *src) as u32,
3372                flatten_def_id(&offsets, *dst) as u32,
3373                *w,
3374            )
3375        })
3376        .collect();
3377
3378    let def_ranks = pagerank(n_defs, &flat_def_edges, None);
3379
3380    // Derive file-level edges from def-level call edges. A cross-file def→def
3381    // edge contributes one file→file edge in the same direction; intra-file
3382    // edges are skipped (they cannot move rank between files).
3383    let mut file_edge_map: HashMap<(u32, u32), u32> = HashMap::new();
3384    for &(src, dst, w) in &def_edges {
3385        let src_file = src.0;
3386        let dst_file = dst.0;
3387        if src_file != dst_file {
3388            *file_edge_map.entry((src_file, dst_file)).or_insert(0) += w;
3389        }
3390    }
3391    let file_edges: Vec<(u32, u32, u32)> = file_edge_map
3392        .into_iter()
3393        .map(|((src, dst), w)| (src, dst, w))
3394        .collect();
3395
3396    // File-level rank: run PageRank directly on the file-level edge graph
3397    // (B-0025 fix, Cycle 11). The earlier formulation aggregated file rank
3398    // by summing per-def ranks of every def in the file, which amplified
3399    // the teleportation floor for files with high def cardinality but zero
3400    // in-degree (test files in particular). Running PageRank on
3401    // `file_edges` makes file rank obey the user-visible invariant: rank
3402    // flows from caller-file to callee-file, never from callee to caller.
3403    //
3404    // A file with no incoming file→file edges receives only the
3405    // teleportation floor `(1 - DAMPING) / n_files`, regardless of how
3406    // many defs it contains. Test files (which call hubs but are called
3407    // by no one) now correctly drop to the floor.
3408    let n_files = files.len();
3409    let base_ranks: Vec<f32> = if n_files == 0 {
3410        Vec::new()
3411    } else {
3412        pagerank(n_files, &file_edges, None)
3413    };
3414
3415    // Build def-level caller/callee lists
3416    let (def_callers, def_callees) = build_def_neighbor_lists(n_defs, &flat_def_edges, &offsets);
3417
3418    DefGraphData {
3419        def_edges,
3420        def_ranks,
3421        def_callers,
3422        def_callees,
3423        offsets,
3424        base_ranks,
3425        file_edges,
3426    }
3427}
3428
3429/// Build a dependency graph from a repository root.
3430///
3431/// Walks the directory tree, parses each supported file with tree-sitter,
3432/// extracts definitions and imports, resolves import paths to files, runs
3433/// `PageRank`, and builds caller/callee lists.
3434///
3435/// # Errors
3436///
3437/// Returns an error if file walking or reading fails.
3438#[expect(
3439    clippy::too_many_lines,
3440    reason = "three-phase parallel pipeline (walk+filter, def+import extraction, call extraction) \
3441              plus resolve + graph build; phases share state (file_index, raw_sources) and \
3442              cannot be meaningfully split without passing large mutable structures across \
3443              boundaries with no clarity gain"
3444)]
3445pub fn build_graph(root: &Path) -> crate::Result<RepoGraph> {
3446    let root = root.canonicalize().map_err(|e| crate::Error::Io {
3447        path: root.display().to_string(),
3448        source: e,
3449    })?;
3450
3451    let mut walk_options = walk::WalkOptions::default();
3452    if let Some((_, config)) = crate::cache::config::find_config(&root) {
3453        walk_options.ignore_patterns = config.ignore.patterns;
3454    }
3455    let all_files = walk::collect_files_with_options(&root, &walk_options);
3456
3457    // Phase 1: parallel filter + read. For each candidate path with a
3458    // supported extension, read its source from disk and emit a tuple
3459    // alongside its relative path. rayon spreads the I/O cost across
3460    // worker threads; on a 1M-file corpus this was ~20s sequential and
3461    // now sits in the 2-3s range bounded by disk + filter throughput.
3462    let raw_inputs: Vec<(PathBuf, String, String, String)> = all_files
3463        .par_iter()
3464        .filter_map(|path| {
3465            let ext = path
3466                .extension()
3467                .and_then(|e| e.to_str())
3468                .unwrap_or_default()
3469                .to_string();
3470            if languages::config_for_extension(&ext).is_none()
3471                && import_query_for_extension(&ext).is_none()
3472            {
3473                return None;
3474            }
3475            let source = std::fs::read_to_string(path).ok()?;
3476            let rel_path = path
3477                .strip_prefix(&root)
3478                .unwrap_or(path)
3479                .display()
3480                .to_string();
3481            Some((path.clone(), rel_path, ext, source))
3482        })
3483        .collect();
3484
3485    // Build the contiguous `files` Vec and the absolute-path -> idx
3486    // lookup. Sequential because both want stable indices that match
3487    // `raw_sources`'s order; the per-file work this gates is trivial.
3488    let mut file_index: HashMap<PathBuf, usize> = HashMap::with_capacity(raw_inputs.len());
3489    let mut files: Vec<FileNode> = Vec::with_capacity(raw_inputs.len());
3490    let mut raw_sources: Vec<(usize, String, String)> = Vec::with_capacity(raw_inputs.len());
3491    for (idx, (abs_path, rel_path, ext, source)) in raw_inputs.into_iter().enumerate() {
3492        file_index.insert(abs_path, idx);
3493        files.push(FileNode {
3494            path: rel_path,
3495            defs: vec![],
3496            imports: vec![],
3497        });
3498        raw_sources.push((idx, ext, source));
3499    }
3500
3501    // Phase 2: parallel per-file definition + import extraction. Each
3502    // file's tree-sitter parse + def/import queries are independent;
3503    // par_iter_mut over files.iter_mut().zip(raw_sources.par_iter())
3504    // lets every rayon worker grind its own slice. The closures here
3505    // borrow `&root` and `&file_index` immutably (both Sync) and write
3506    // disjoint `FileNode` slots via the &mut iterator.
3507    files
3508        .par_iter_mut()
3509        .zip(raw_sources.par_iter())
3510        .for_each(|(file, (_, ext, source))| {
3511            if let Some(config) = languages::config_for_extension(ext) {
3512                file.defs = extract_definitions(source, &config);
3513                // Go method_declaration scopes are empty after the generic
3514                // extract_definitions pass (the method is a top-level node
3515                // with no structural parent in CONTAINER_KINDS). Enrich them
3516                // with the receiver type so that resolve_calls Priority 2
3517                // (scope.contains(recv_type)) fires correctly for cross-file
3518                // Go receiver-method calls. This populates def_callers[] for
3519                // Go in compute_def_graph (P1 fix).
3520                if languages::is_go_language(&config.language) {
3521                    enrich_go_method_def_scopes(source, &mut file.defs);
3522                }
3523                // SQL: prepend a synthetic file-level def named after the
3524                // filename stem (dbt/sqlmesh convention). The whole-file
3525                // byte range becomes the smallest-enclosing fallback for
3526                // FROM/JOIN call-edges that are not inside any CTE, which
3527                // is the resolution target for cross-model references
3528                // (S1, Wave 4). file.path is relative to the repo root and
3529                // is what file_stem() needs to derive the model name.
3530                if languages::is_sql_language(&config.language) {
3531                    enrich_sql_file_def(&file.path, source, &mut file.defs);
3532                }
3533            }
3534            if let Some((lang, import_query)) = import_query_for_extension(ext) {
3535                let raw_imports = extract_imports(source, &lang, &import_query);
3536                let file_path = root.join(&file.path);
3537                file.imports = raw_imports
3538                    .into_iter()
3539                    .map(|raw| {
3540                        let resolved_idx =
3541                            resolve_import(&raw, ext, &file_path, &root, &file_index)
3542                                .and_then(|i| u32::try_from(i).ok());
3543                        ImportRef {
3544                            raw_path: raw,
3545                            resolved_idx,
3546                        }
3547                    })
3548                    .collect();
3549            }
3550        });
3551
3552    // Phase 3: parallel per-file call extraction. Mutates each
3553    // FileNode's `defs[*].calls` independently. Aligned with
3554    // raw_sources by index via the zip.
3555    files
3556        .par_iter_mut()
3557        .zip(raw_sources.par_iter())
3558        .for_each(|(file, (_, ext, source))| {
3559            if let Some(call_config) = languages::call_query_for_extension(ext) {
3560                extract_calls(source, &call_config, &mut file.defs);
3561            }
3562        });
3563
3564    // Build the Python class hierarchy (class_name → parent class names) by
3565    // walking every Python source file. The map is used by `resolve_calls`
3566    // Priority 2.5 to dispatch `self.method()` calls through the MRO when
3567    // the method lives on a parent / mixin class (Q1, Wave 2).
3568    //
3569    // Parallel: extract_python_class_hierarchy is pure per-file, then we
3570    // fold the per-file maps into one global map sequentially because
3571    // HashMap is not lock-free. On a 1k-Python-file corpus this fold takes
3572    // <10ms — much smaller than the parallel parse work that feeds it.
3573    let python_hierarchies: Vec<HashMap<String, Vec<String>>> = raw_sources
3574        .par_iter()
3575        .map(|(_, ext, source)| {
3576            if ext == "py" || ext == "pyi" {
3577                extract_python_class_hierarchy(source)
3578            } else {
3579                HashMap::new()
3580            }
3581        })
3582        .collect();
3583    let mut python_class_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
3584    for local in python_hierarchies {
3585        for (k, v) in local {
3586            // First declaration wins on name collisions across files. The
3587            // MRO walk only needs a plausible parent chain to find an
3588            // ancestor's methods; this is conservative but acceptable.
3589            python_class_hierarchy.entry(k).or_insert(v);
3590        }
3591    }
3592
3593    // Resolve call references to target definitions
3594    let def_index = build_def_index(&files);
3595    resolve_calls(&mut files, &def_index, &python_class_hierarchy);
3596
3597    // Build def-level graph, compute PageRank, and derive file-level data
3598    let graph_data = compute_def_graph(&files);
3599
3600    // Build file-level caller/callee lists
3601    let n = files.len();
3602    let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
3603
3604    // Auto-tune alpha based on graph density
3605    #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
3606    let density = if n > 1 {
3607        graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
3608    } else {
3609        0.0
3610    };
3611    let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
3612
3613    Ok(RepoGraph {
3614        files,
3615        edges: graph_data.file_edges,
3616        base_ranks: graph_data.base_ranks,
3617        callers,
3618        callees,
3619        def_edges: graph_data.def_edges,
3620        def_ranks: graph_data.def_ranks,
3621        def_callers: graph_data.def_callers,
3622        def_callees: graph_data.def_callees,
3623        def_offsets: graph_data.offsets,
3624        alpha,
3625    })
3626}
3627
3628/// Build a `RepoGraph` directly from a pre-constructed `Vec<FileNode>`.
3629///
3630/// Skips the filesystem walk phase of [`build_graph`]; useful for integration
3631/// tests that want to build synthetic graphs without touching disk.
3632///
3633/// Resolves calls, builds the def-level graph (including G3 trait↔impl edges),
3634/// computes `PageRank`, and builds caller/callee lists.
3635#[must_use]
3636pub fn build_graph_from_files_pub(files: Vec<FileNode>) -> RepoGraph {
3637    let empty_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
3638    build_graph_from_files_with_mro_pub(files, &empty_hierarchy)
3639}
3640
3641/// Build a `RepoGraph` from synthetic files with Python MRO resolution enabled.
3642///
3643/// Identical to [`build_graph_from_files_pub`] except that the resolver's
3644/// Priority 2.5 (MRO walk) fires against the caller-supplied
3645/// `python_class_hierarchy` map. Use this when an integration test needs to
3646/// exercise `self.method()` dispatch through inherited classes — including
3647/// the inverse-edge propagation into [`RepoGraph::def_callers`] that
3648/// `lsp_incoming_calls` consumes.
3649///
3650/// I#58 / 4.1.3: when a `SubScreen.method_caller` calls `self.method()` and
3651/// the MRO walk binds the call to `Mixin.method`, the forward edge
3652/// `(SubScreen.method_caller, Mixin.method)` is recorded in
3653/// [`DefGraphData::def_edges`]. [`build_def_neighbor_lists`] then derives
3654/// symmetric inverse-edge entries: `def_callers[Mixin.method]` includes
3655/// `SubScreen.method_caller`. This is the property `lsp_incoming_calls` reads.
3656#[must_use]
3657pub fn build_graph_from_files_with_mro_pub<H>(
3658    mut files: Vec<FileNode>,
3659    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
3660) -> RepoGraph
3661where
3662    H: std::hash::BuildHasher,
3663{
3664    let def_index = build_def_index(&files);
3665    resolve_calls(&mut files, &def_index, python_class_hierarchy);
3666    let graph_data = compute_def_graph(&files);
3667    let n = files.len();
3668    let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
3669
3670    #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
3671    let density = if n > 1 {
3672        graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
3673    } else {
3674        0.0
3675    };
3676    let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
3677
3678    RepoGraph {
3679        files,
3680        edges: graph_data.file_edges,
3681        base_ranks: graph_data.base_ranks,
3682        callers,
3683        callees,
3684        def_edges: graph_data.def_edges,
3685        def_ranks: graph_data.def_ranks,
3686        def_callers: graph_data.def_callers,
3687        def_callees: graph_data.def_callees,
3688        def_offsets: graph_data.offsets,
3689        alpha,
3690    }
3691}
3692
3693// ── Dead-code analysis ───────────────────────────────────────────────────────
3694
3695/// Global flat definition index: `def_offsets[file_idx] + def_idx_within_file`.
3696///
3697/// This is the natural index into [`RepoGraph::def_ranks`],
3698/// [`RepoGraph::def_callers`], and [`RepoGraph::def_callees`].
3699pub type DefIndex = usize;
3700
3701/// A connected component in the dead-code subgraph.
3702///
3703/// All members are definitions that are unreachable from any entry point.
3704/// The component is formed by treating `def_callees + def_callers` edges as
3705/// undirected — so a transitively-dead group (including mutual-recursion
3706/// cycles that are collectively unreachable) surfaces as one cluster.
3707#[derive(Debug, Clone)]
3708pub struct DeadCluster {
3709    /// Global flat def index of the cluster root (the member with the
3710    /// highest [`RepoGraph::def_ranks`] score).
3711    pub root_def_idx: usize,
3712    /// Number of definitions in this cluster.
3713    pub size: usize,
3714    /// Sum of `end_line - start_line` for every member definition.
3715    pub total_lines: usize,
3716    /// All member global def indices, root first.
3717    pub member_def_indices: Vec<usize>,
3718}
3719
3720/// Confidence level for [`DeadCodeReport::dead_fraction`].
3721///
3722/// Callers should interpret `dead_fraction` according to this indicator.
3723/// When entry-point coverage is sparse (common in test-heavy corpora or
3724/// macro-dispatched frameworks), the raw fraction can be misleading — the
3725/// cluster list is always more trustworthy than the absolute number.
3726///
3727/// ## Rubric
3728///
3729/// | Level  | Condition                                                              |
3730/// |--------|------------------------------------------------------------------------|
3731/// | High   | entry coverage ≥ 10 % AND ≥ 1 LibraryExport AND ≥ 1 Main              |
3732/// | Medium | entry coverage ≥ 2 % AND ≥ 1 production entry AND tests not dominant  |
3733/// | Low    | entry coverage < 2 % OR tests > 80 % of all entries                   |
3734///
3735/// "Production entries" = LibraryExport + FrameworkDispatched + Main + Ffi.
3736#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
3737#[serde(rename_all = "snake_case")]
3738pub enum DeadCodeConfidence {
3739    /// Entry-point coverage is dense; `dead_fraction` is a quantitative signal.
3740    High,
3741    /// `dead_fraction` is directional; trust the cluster list more than the number.
3742    Medium,
3743    /// `dead_fraction` may mislead (e.g., tests-dominated corpus); use cluster
3744    /// `member_defs` + `lsp_references` to verify each candidate before deletion.
3745    Low,
3746}
3747
3748/// Compute a [`DeadCodeConfidence`] level from entry-point kind counts.
3749///
3750/// Parameters mirror the counts emitted by
3751/// [`crate::entry_points::summarize_entry_point_kinds`]:
3752///
3753/// - `total_defs` — total number of definitions in the graph.
3754/// - `library_exports` — count of [`crate::entry_points::EntryPointKind::LibraryExport`] entries.
3755/// - `framework_dispatched` — count of [`crate::entry_points::EntryPointKind::FrameworkDispatched`] entries.
3756/// - `main_entries` — count of [`crate::entry_points::EntryPointKind::Main`] entries.
3757/// - `test_entries` — count of [`crate::entry_points::EntryPointKind::Test`] entries.
3758/// - `ffi_entries` — count of [`crate::entry_points::EntryPointKind::Ffi`] entries.
3759///
3760/// ProcMacro / Init / BuildScript entries are not passed separately; they are
3761/// rare enough that their absence does not materially affect the heuristic.
3762#[must_use]
3763#[expect(
3764    clippy::cast_precision_loss,
3765    reason = "entry and def counts fit comfortably in f64 for ratio computation"
3766)]
3767pub fn compute_confidence(
3768    total_defs: usize,
3769    library_exports: usize,
3770    framework_dispatched: usize,
3771    main_entries: usize,
3772    test_entries: usize,
3773    ffi_entries: usize,
3774) -> DeadCodeConfidence {
3775    let total_entries =
3776        library_exports + framework_dispatched + main_entries + test_entries + ffi_entries;
3777    let entry_coverage = if total_defs > 0 {
3778        total_entries as f64 / total_defs as f64
3779    } else {
3780        0.0
3781    };
3782    let test_dominant = total_entries > 0 && (test_entries as f64 / total_entries as f64) > 0.8;
3783    let production_entries = library_exports + framework_dispatched + main_entries + ffi_entries;
3784
3785    if entry_coverage >= 0.10 && library_exports >= 1 && main_entries >= 1 {
3786        DeadCodeConfidence::High
3787    } else if entry_coverage >= 0.02 && production_entries >= 1 && !test_dominant {
3788        DeadCodeConfidence::Medium
3789    } else {
3790        DeadCodeConfidence::Low
3791    }
3792}
3793
3794/// Summary report from [`compute_dead_code`].
3795///
3796/// The primary consumer is X3's `mcp__ripvec__find_dead_code` MCP tool.
3797#[derive(Debug, Clone)]
3798pub struct DeadCodeReport {
3799    /// Dead clusters sorted by size descending (largest first).
3800    pub dead_clusters: Vec<DeadCluster>,
3801    /// Total number of definitions unreachable from any entry point.
3802    pub total_dead_defs: usize,
3803    /// Total number of definitions reachable from at least one entry point.
3804    pub total_live_defs: usize,
3805    /// Fraction of all definitions that are dead: `dead / (dead + live)`.
3806    pub dead_fraction: f32,
3807    /// Confidence that `dead_fraction` is a reliable signal.
3808    ///
3809    /// Defaults to [`DeadCodeConfidence::Low`] until the caller supplies
3810    /// entry-point kind counts via [`compute_confidence`].
3811    pub confidence: DeadCodeConfidence,
3812}
3813
3814/// Returns true if the given path is a test-related path.
3815///
3816/// Heuristic: the path contains `tests/`, `/spec/`, `/specs/`, or the
3817/// file stem starts/ends with `test_`/`_test` or contains `bench`.
3818fn is_test_path(path: &str) -> bool {
3819    let path_lc = path.to_lowercase();
3820    if path_lc.contains("tests/") || path_lc.contains("/spec/") || path_lc.contains("/specs/") {
3821        return true;
3822    }
3823    let file_name = path.rsplit('/').next().unwrap_or(path);
3824    let stem = file_name.split('.').next().unwrap_or(file_name);
3825    stem.starts_with("test_") || stem.ends_with("_test") || stem.contains("bench")
3826}
3827
3828/// Resolve a flat [`DefIndex`] to its file index using the `def_offsets`
3829/// prefix-sum table.
3830fn flat_to_file_idx(offsets: &[usize], flat: DefIndex) -> usize {
3831    offsets.partition_point(|&o| o <= flat).saturating_sub(1)
3832}
3833
3834/// Union-find: path-compressing find. Returns the root representative of `x`.
3835fn uf_find(parent: &mut Vec<usize>, x: usize) -> usize {
3836    if parent[x] != x {
3837        parent[x] = uf_find(parent, parent[x]);
3838    }
3839    parent[x]
3840}
3841
3842/// Union-find: merge the components containing `x` and `y`.
3843fn uf_union(parent: &mut Vec<usize>, x: usize, y: usize) {
3844    let rx = uf_find(parent, x);
3845    let ry = uf_find(parent, y);
3846    if rx != ry {
3847        parent[rx] = ry;
3848    }
3849}
3850
3851/// Compute the set of definitions unreachable from any entry point.
3852///
3853/// Returns dead clusters (connected components in the unreachable subgraph),
3854/// sorted by size descending.
3855///
3856/// # Algorithm
3857///
3858/// 1. Optionally filter test paths from `entry_def_indices` (when
3859///    `include_test_paths` is `false`). Test-path heuristic: the file path
3860///    contains `test_`, `_test`, `tests/`, `spec/`, `specs/`, or `bench`.
3861/// 2. Build full forward + reverse adjacency from [`RepoGraph::def_edges`]
3862///    (the untruncated edge list) as a Compressed Sparse Row (CSR) pair
3863///    of (`starts`, `dst`) `Vec<u32>` arrays — bounded O(E) memory rather
3864///    than the O(n_defs * avg_fanout) of duplicated `Vec<Vec<DefIndex>>`
3865///    that crashed at kernel scale (I#61). The BFS does NOT use
3866///    [`RepoGraph::def_callees`] / [`RepoGraph::def_callers`]: those are
3867///    display-oriented neighbor lists capped at [`MAX_NEIGHBORS`] per node,
3868///    which dropped hub callees from the live set and produced
3869///    false-positive dead reports (I#57).
3870/// 3. BFS forward over the full forward adjacency from the entry seeds ->
3871///    reachable set.
3872/// 4. Complement `(all_defs - reachable)` = dead set.
3873/// 5. Connected-components on the dead subgraph via the full forward +
3874///    reverse adjacency treated as undirected (union-find).
3875/// 5. For each cluster: pick the highest-rank def as the cluster root;
3876///    `size` = member count; `total_lines` = sum of `(end_line -
3877///    start_line)` per member.
3878/// 6. Sort clusters by size descending.
3879#[must_use]
3880#[expect(
3881    clippy::too_many_lines,
3882    reason = "six-step BFS+clustering pipeline; splitting into sub-functions \
3883              would require passing many interdependent Vec borrows with no \
3884              clarity gain"
3885)]
3886pub fn compute_dead_code<S: std::hash::BuildHasher>(
3887    graph: &RepoGraph,
3888    entry_def_indices: &HashSet<DefIndex, S>,
3889    include_test_paths: bool,
3890) -> DeadCodeReport {
3891    let n_defs = graph.def_ranks.len();
3892    if n_defs == 0 {
3893        return DeadCodeReport {
3894            dead_clusters: vec![],
3895            total_dead_defs: 0,
3896            total_live_defs: 0,
3897            dead_fraction: 0.0,
3898            confidence: DeadCodeConfidence::Low,
3899        };
3900    }
3901
3902    // Step 1: build the effective seed set.
3903    let seeds: Vec<DefIndex> = if include_test_paths {
3904        entry_def_indices.iter().copied().collect()
3905    } else {
3906        entry_def_indices
3907            .iter()
3908            .copied()
3909            .filter(|&flat| {
3910                let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
3911                let path = graph
3912                    .files
3913                    .get(file_idx)
3914                    .map(|f| f.path.as_str())
3915                    .unwrap_or("");
3916                !is_test_path(path)
3917            })
3918            .collect()
3919    };
3920
3921    // Build CSR-style forward + reverse adjacency from `graph.def_edges`.
3922    //
3923    // We do NOT use `graph.def_callees` / `graph.def_callers` here because
3924    // those are display-oriented neighbor lists truncated to MAX_NEIGHBORS
3925    // in `build_def_neighbor_lists`. Reading a truncated forward list during
3926    // BFS produced false-positive dead reports (I#57): a hub function with
3927    // more than 5 outgoing call edges silently dropped its low-weight
3928    // callees from the live set, and every helper transitively reachable
3929    // only through those dropped edges was marked dead.
3930    //
3931    // The original I#57 fix materialized `full_callees` / `full_callers` as
3932    // two `Vec<Vec<DefIndex>>` tables, which at kernel scale (n_defs ≈
3933    // 600 K with high-fan-out hubs) cost ~1.9 GB and OOM-crashed the MCP
3934    // server on every parameter variant (I#61, regression).
3935    //
3936    // This implementation uses Compressed Sparse Row (CSR) bucketing:
3937    // - `fwd_starts[i..=i+1]` slices `fwd_dst` to the destinations of node i.
3938    // - `rev_starts[i..=i+1]` slices `rev_src` to the sources that call i.
3939    // Storage is bounded O(E) (one u32 per directed edge × 2 directions)
3940    // plus O(n_defs) for the two start arrays. For a 600 K-def / 5 M-edge
3941    // kernel graph this is ~80 MB rather than ~1.9 GB.
3942    //
3943    // u32 is sufficient: every flat DefIndex value is < n_defs and ripvec
3944    // already constrains `n_defs <= u32::MAX` upstream (DefId.0 is u32).
3945    #[expect(
3946        clippy::cast_possible_truncation,
3947        reason = "n_defs <= u32::MAX is a ripvec-wide invariant; the per-edge \
3948                  cast cannot overflow"
3949    )]
3950    let (fwd_starts, fwd_dst, rev_starts, rev_src) = {
3951        // First pass: per-node out-degree (forward) and in-degree (reverse).
3952        let mut fwd_deg: Vec<u32> = vec![0; n_defs];
3953        let mut rev_deg: Vec<u32> = vec![0; n_defs];
3954        for &(src, dst, _w) in &graph.def_edges {
3955            let src_flat = graph.def_offsets[src.0 as usize] + src.1 as usize;
3956            let dst_flat = graph.def_offsets[dst.0 as usize] + dst.1 as usize;
3957            if src_flat < n_defs && dst_flat < n_defs {
3958                fwd_deg[src_flat] += 1;
3959                rev_deg[dst_flat] += 1;
3960            }
3961        }
3962        // Exclusive prefix sums -> bucket starts of length n_defs + 1.
3963        let mut fwd_starts: Vec<u32> = Vec::with_capacity(n_defs + 1);
3964        let mut rev_starts: Vec<u32> = Vec::with_capacity(n_defs + 1);
3965        let mut acc_f: u32 = 0;
3966        let mut acc_r: u32 = 0;
3967        fwd_starts.push(0);
3968        rev_starts.push(0);
3969        for i in 0..n_defs {
3970            acc_f = acc_f.saturating_add(fwd_deg[i]);
3971            acc_r = acc_r.saturating_add(rev_deg[i]);
3972            fwd_starts.push(acc_f);
3973            rev_starts.push(acc_r);
3974        }
3975        // Second pass: place each edge using a per-node cursor.
3976        let total_edges = acc_f as usize;
3977        let mut fwd_dst: Vec<u32> = vec![0u32; total_edges];
3978        let mut rev_src: Vec<u32> = vec![0u32; total_edges];
3979        // Reuse fwd_deg / rev_deg as cursors counting down from per-bucket capacity.
3980        // Easier: cursors start at the bucket start.
3981        let mut fwd_cursor: Vec<u32> = fwd_starts[..n_defs].to_vec();
3982        let mut rev_cursor: Vec<u32> = rev_starts[..n_defs].to_vec();
3983        for &(src, dst, _w) in &graph.def_edges {
3984            let src_flat = graph.def_offsets[src.0 as usize] + src.1 as usize;
3985            let dst_flat = graph.def_offsets[dst.0 as usize] + dst.1 as usize;
3986            if src_flat < n_defs && dst_flat < n_defs {
3987                let f = fwd_cursor[src_flat] as usize;
3988                fwd_dst[f] = dst_flat as u32;
3989                fwd_cursor[src_flat] += 1;
3990                let r = rev_cursor[dst_flat] as usize;
3991                rev_src[r] = src_flat as u32;
3992                rev_cursor[dst_flat] += 1;
3993            }
3994        }
3995        (fwd_starts, fwd_dst, rev_starts, rev_src)
3996    };
3997
3998    // Helper closures: O(1) bucket lookup into the CSR forward / reverse views.
3999    let fwd_callees = |flat: DefIndex| -> &[u32] {
4000        let lo = fwd_starts[flat] as usize;
4001        let hi = fwd_starts[flat + 1] as usize;
4002        &fwd_dst[lo..hi]
4003    };
4004    let rev_callers = |flat: DefIndex| -> &[u32] {
4005        let lo = rev_starts[flat] as usize;
4006        let hi = rev_starts[flat + 1] as usize;
4007        &rev_src[lo..hi]
4008    };
4009
4010    // Step 2: BFS forward over the CSR forward view from seeds -> reachable set.
4011    let mut reachable: Vec<bool> = vec![false; n_defs];
4012    let mut queue: std::collections::VecDeque<DefIndex> = std::collections::VecDeque::new();
4013
4014    for seed in &seeds {
4015        if *seed < n_defs && !reachable[*seed] {
4016            reachable[*seed] = true;
4017            queue.push_back(*seed);
4018        }
4019    }
4020
4021    while let Some(flat) = queue.pop_front() {
4022        for &callee_flat_u32 in fwd_callees(flat) {
4023            let callee_flat = callee_flat_u32 as usize;
4024            if callee_flat < n_defs && !reachable[callee_flat] {
4025                reachable[callee_flat] = true;
4026                queue.push_back(callee_flat);
4027            }
4028        }
4029    }
4030
4031    // Step 3: dead set = all_defs - reachable.
4032    let dead: Vec<DefIndex> = (0..n_defs).filter(|&i| !reachable[i]).collect();
4033    let total_dead_defs = dead.len();
4034    let total_live_defs = n_defs - total_dead_defs;
4035
4036    if dead.is_empty() {
4037        return DeadCodeReport {
4038            dead_clusters: vec![],
4039            total_dead_defs: 0,
4040            total_live_defs,
4041            dead_fraction: 0.0,
4042            confidence: DeadCodeConfidence::Low,
4043        };
4044    }
4045
4046    // Step 4: connected components on the dead subgraph via union-find.
4047    let dead_set: HashSet<DefIndex> = dead.iter().copied().collect();
4048    let dead_pos: HashMap<DefIndex, usize> = dead
4049        .iter()
4050        .copied()
4051        .enumerate()
4052        .map(|(pos, idx)| (idx, pos))
4053        .collect();
4054    let m = dead.len();
4055    let mut parent: Vec<usize> = (0..m).collect();
4056
4057    // Use the same CSR forward / reverse views built above for the BFS so
4058    // the dead subgraph's connected components reflect every actual call
4059    // edge — not the [`MAX_NEIGHBORS`]-truncated display view (I#57). The CSR
4060    // layout keeps storage O(E) at kernel scale rather than the duplicated
4061    // O(n_defs * avg_fanout) of the pre-I#61 `Vec<Vec<DefIndex>>` (I#61).
4062    for &flat in &dead {
4063        let pos_flat = dead_pos[&flat];
4064        for &callee_flat_u32 in fwd_callees(flat) {
4065            let callee_flat = callee_flat_u32 as usize;
4066            if dead_set.contains(&callee_flat) {
4067                let pos_callee = dead_pos[&callee_flat];
4068                uf_union(&mut parent, pos_flat, pos_callee);
4069            }
4070        }
4071        for &caller_flat_u32 in rev_callers(flat) {
4072            let caller_flat = caller_flat_u32 as usize;
4073            if dead_set.contains(&caller_flat) {
4074                let pos_caller = dead_pos[&caller_flat];
4075                uf_union(&mut parent, pos_flat, pos_caller);
4076            }
4077        }
4078    }
4079
4080    // Flatten roots (path compression for all).
4081    for i in 0..m {
4082        uf_find(&mut parent, i);
4083    }
4084
4085    // Group members by their component root.
4086    let mut components: HashMap<usize, Vec<DefIndex>> = HashMap::new();
4087    for (pos, &flat) in dead.iter().enumerate() {
4088        let root_pos = parent[pos];
4089        components.entry(root_pos).or_default().push(flat);
4090    }
4091
4092    // Step 5: build clusters - root is the highest-rank member.
4093    let mut clusters: Vec<DeadCluster> = components
4094        .into_values()
4095        .map(|members| {
4096            let root_flat = members
4097                .iter()
4098                .copied()
4099                .max_by(|&a, &b| {
4100                    let ra = graph.def_ranks.get(a).copied().unwrap_or(0.0);
4101                    let rb = graph.def_ranks.get(b).copied().unwrap_or(0.0);
4102                    ra.total_cmp(&rb)
4103                })
4104                .unwrap_or(members[0]);
4105
4106            let total_lines: usize = members
4107                .iter()
4108                .copied()
4109                .map(|flat| {
4110                    let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
4111                    let def_idx = flat - graph.def_offsets[file_idx];
4112                    let def = graph.files.get(file_idx).and_then(|f| f.defs.get(def_idx));
4113                    def.map(|d| (d.end_line as usize).saturating_sub(d.start_line as usize))
4114                        .unwrap_or(0)
4115                })
4116                .sum();
4117
4118            let mut member_def_indices: Vec<usize> = std::iter::once(root_flat)
4119                .chain(members.iter().copied().filter(|&m| m != root_flat))
4120                .collect();
4121            if member_def_indices.len() > 1 {
4122                member_def_indices[1..].sort_unstable();
4123            }
4124
4125            DeadCluster {
4126                root_def_idx: root_flat,
4127                size: member_def_indices.len(),
4128                total_lines,
4129                member_def_indices,
4130            }
4131        })
4132        .collect();
4133
4134    // Step 6: sort by size descending.
4135    clusters.sort_by(|a, b| {
4136        b.size
4137            .cmp(&a.size)
4138            .then(b.root_def_idx.cmp(&a.root_def_idx))
4139    });
4140
4141    #[expect(
4142        clippy::cast_precision_loss,
4143        reason = "def counts fit comfortably in f32"
4144    )]
4145    let dead_fraction = if n_defs > 0 {
4146        total_dead_defs as f32 / n_defs as f32
4147    } else {
4148        0.0
4149    };
4150
4151    DeadCodeReport {
4152        dead_clusters: clusters,
4153        total_dead_defs,
4154        total_live_defs,
4155        dead_fraction,
4156        // Caller must set confidence via compute_confidence() after supplying
4157        // entry-point kind counts. Defaulting to Low is conservative.
4158        confidence: DeadCodeConfidence::Low,
4159    }
4160}
4161
4162impl RepoGraph {
4163    /// Get the `PageRank` score for a specific definition.
4164    #[must_use]
4165    pub fn def_rank(&self, did: DefId) -> f32 {
4166        let flat = self.def_offsets[did.0 as usize] + did.1 as usize;
4167        self.def_ranks.get(flat).copied().unwrap_or(0.0)
4168    }
4169
4170    /// Look up a definition by file path and name. Returns the first match.
4171    #[must_use]
4172    pub fn find_def(&self, file_path: &str, def_name: &str) -> Option<DefId> {
4173        for (file_idx, file) in self.files.iter().enumerate() {
4174            if file.path == file_path {
4175                for (def_idx, def) in file.defs.iter().enumerate() {
4176                    if def.name == def_name {
4177                        #[expect(clippy::cast_possible_truncation)]
4178                        return Some((file_idx as u32, def_idx as u16));
4179                    }
4180                }
4181            }
4182        }
4183        None
4184    }
4185
4186    /// Resolve a caller-supplied `focus_file` string to a file index in [`Self::files`].
4187    ///
4188    /// Accepts any of the path forms that ripvec itself emits or accepts:
4189    ///
4190    /// - **Exact stored path** (`device_opt/services/storage.py`) — direct match.
4191    /// - **LSP-shaped path** (`./device_opt/services/storage.py`) — the `./`
4192    ///   prefix used by every [`RepoMapLspLocation::file_path`] is stripped
4193    ///   before comparison so the documented chaining pattern
4194    ///   `get_repo_map(focus_file=hits[0].lsp_location.file_path)` works.
4195    /// - **Strict suffix** (`storage.py`, `services/storage.py`) — match when
4196    ///   the previous character in the stored path is `/`. Avoids matching
4197    ///   `foo_storage.py` for `storage.py`.
4198    ///
4199    /// Returns [`FocusResolution::Found`] when exactly one file matches,
4200    /// [`FocusResolution::Ambiguous`] when multiple files match (the caller
4201    /// surfaces the candidate list to the user), and [`FocusResolution::NotFound`]
4202    /// when no file matches.
4203    ///
4204    /// # Background
4205    ///
4206    /// Prior to this helper the MCP layer (`crates/ripvec-mcp/src/tools.rs`)
4207    /// did the matching inline with two bugs:
4208    ///
4209    /// 1. **`./` prefix mismatch.** [`RepoMapLspLocation::file_path`] always
4210    ///    carries a leading `./` (see [`file_lsp_location`]), but
4211    ///    [`FileNode::path`] does not. Passing the LSP location verbatim as
4212    ///    `focus_file` matched zero files. The matcher silently returned
4213    ///    `focus = None`, producing rank values bit-identical to the unfocused
4214    ///    call — the bug originally reported as "I#20 focus_file rebias
4215    ///    invisible on Python".
4216    /// 2. **Equal-length false negative.** When the user passed
4217    ///    `./device_opt/services/storage.py` and the stored path was
4218    ///    `device_opt/services/storage.py`, `exact` was false (the strings
4219    ///    differ by two bytes) and `strict_suffix` was false (the focus is
4220    ///    longer than the stored path, so `p.len() > focus.len()` fails). The
4221    ///    pathology surfaced specifically when the focus was a *full* path
4222    ///    with the LSP `./` prefix.
4223    ///
4224    /// Centralising the resolution here gives every caller the same
4225    /// normalization-tolerant semantics and one place to test the contract.
4226    #[must_use]
4227    pub fn resolve_focus_file(&self, focus: &str) -> FocusResolution {
4228        let normalized = normalize_focus_path(focus);
4229        let matches: Vec<usize> = self
4230            .files
4231            .iter()
4232            .enumerate()
4233            .filter_map(|(idx, f)| {
4234                if focus_matches_path(&f.path, normalized) {
4235                    Some(idx)
4236                } else {
4237                    None
4238                }
4239            })
4240            .collect();
4241        match matches.len() {
4242            0 => FocusResolution::NotFound,
4243            1 => FocusResolution::Found(matches[0]),
4244            _ => FocusResolution::Ambiguous(
4245                matches
4246                    .into_iter()
4247                    .map(|i| self.files[i].path.clone())
4248                    .collect(),
4249            ),
4250        }
4251    }
4252}
4253
4254/// Result of resolving a user-supplied `focus_file` string against a [`RepoGraph`].
4255///
4256/// See [`RepoGraph::resolve_focus_file`] for the resolution semantics and the
4257/// historical bug that motivated the helper.
4258#[derive(Debug, Clone)]
4259pub enum FocusResolution {
4260    /// Exactly one file matched. Carries the file index in [`RepoGraph::files`].
4261    Found(usize),
4262    /// No file matched. The caller treats this as an unfocused call.
4263    NotFound,
4264    /// Two or more files matched. The caller surfaces the candidate list so
4265    /// the user can disambiguate by passing a longer suffix or the full path.
4266    Ambiguous(Vec<String>),
4267}
4268
4269/// Strip the leading `./` prefix from a focus_file path.
4270///
4271/// The `./` form is produced by [`file_lsp_location`] for every
4272/// [`RepoMapLspLocation::file_path`] field on a relative path. Stripping it
4273/// gives a stored-path-shaped value for the suffix matcher to compare
4274/// against [`FileNode::path`] entries (which do not carry the prefix).
4275///
4276/// Absolute paths (`/abs/path/file.py`) are returned unchanged; they will
4277/// fail the suffix match against the relative stored paths, which is the
4278/// correct behavior (the caller meant a different root entirely).
4279fn normalize_focus_path(focus: &str) -> &str {
4280    focus.strip_prefix("./").unwrap_or(focus)
4281}
4282
4283/// Return true when `focus` matches `stored_path` as either an exact path or
4284/// a strict-suffix (must be preceded by `/`). The empty focus does not match.
4285fn focus_matches_path(stored_path: &str, focus: &str) -> bool {
4286    if focus.is_empty() {
4287        return false;
4288    }
4289    if stored_path == focus {
4290        return true;
4291    }
4292    stored_path.len() > focus.len()
4293        && stored_path.ends_with(focus)
4294        && stored_path.as_bytes()[stored_path.len() - focus.len() - 1] == b'/'
4295}
4296
4297/// Build top-N caller and callee lists for each file.
4298///
4299/// Given a list of weighted directed edges `(src, dst, weight)` over `n`
4300/// nodes, returns `(callers[i], callees[i])` for each node `i`, where each
4301/// list contains the top-[`MAX_NEIGHBORS`] adjacent nodes sorted by descending
4302/// edge weight.
4303///
4304/// Exposed as `pub` so that integration tests can construct synthetic
4305/// [`RepoGraph`] instances for unit-testing the JSON rendering without going
4306/// through a full disk walk.
4307#[must_use]
4308pub fn build_neighbor_lists(n: usize, edges: &[(u32, u32, u32)]) -> (Vec<Vec<u32>>, Vec<Vec<u32>>) {
4309    let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
4310    let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
4311
4312    for &(src, dst, w) in edges {
4313        let (s, d) = (src as usize, dst as usize);
4314        if s < n && d < n {
4315            incoming[d].push((src, w));
4316            outgoing[s].push((dst, w));
4317        }
4318    }
4319
4320    // Sort by weight descending, keep top N
4321    let trim = |lists: &mut [Vec<(u32, u32)>]| -> Vec<Vec<u32>> {
4322        lists
4323            .iter_mut()
4324            .map(|list| {
4325                list.sort_by_key(|b| std::cmp::Reverse(b.1));
4326                list.iter()
4327                    .take(MAX_NEIGHBORS)
4328                    .map(|(idx, _)| *idx)
4329                    .collect()
4330            })
4331            .collect()
4332    };
4333
4334    (trim(&mut incoming), trim(&mut outgoing))
4335}
4336
4337// ── Rendering ────────────────────────────────────────────────────────
4338
4339/// Render a budget-constrained overview of the repository.
4340///
4341/// Files are sorted by `PageRank` (or topic-sensitive rank if `focus` is
4342/// `Some`). Output uses four tiers of decreasing detail:
4343///
4344/// - **Tier 0** (top 10%): full path, rank, callers/callees, signatures with scopes
4345/// - **Tier 1** (next 20%): full path, rank, signatures
4346/// - **Tier 2** (next 40%): full path, rank, definition names and kinds
4347/// - **Tier 3** (bottom 30%): file path only
4348///
4349/// Stops accumulating output when the estimated token count exceeds
4350/// `max_tokens`.
4351#[must_use]
4352pub fn render(graph: &RepoGraph, max_tokens: usize, focus: Option<usize>) -> String {
4353    let n = graph.files.len();
4354    if n == 0 {
4355        return String::new();
4356    }
4357
4358    // Compute ranks (recompute topic-sensitive if focus is given)
4359    let ranks = if focus.is_some() {
4360        pagerank(n, &graph.edges, focus)
4361    } else {
4362        graph.base_ranks.clone()
4363    };
4364
4365    // Sort file indices by rank descending
4366    let mut sorted: Vec<usize> = (0..n).collect();
4367    sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
4368
4369    let mut output = String::new();
4370    let mut used_tokens = 0;
4371    let max_chars = max_tokens * CHARS_PER_TOKEN;
4372
4373    for (rank_pos, &file_idx) in sorted.iter().enumerate() {
4374        if used_tokens >= max_tokens {
4375            break;
4376        }
4377
4378        let file = &graph.files[file_idx];
4379        let score = ranks[file_idx];
4380        #[expect(clippy::cast_precision_loss, reason = "file counts fit in f32")]
4381        let percentile = (rank_pos as f32) / (n as f32);
4382
4383        let section = if percentile < 0.1 {
4384            render_tier0(graph, file_idx, file, score)
4385        } else if percentile < 0.3 {
4386            render_tier1(file, score)
4387        } else if percentile < 0.7 {
4388            render_tier2(file, score)
4389        } else {
4390            render_tier3(file)
4391        };
4392
4393        let section_chars = section.len();
4394        if used_tokens > 0 && used_tokens + section_chars / CHARS_PER_TOKEN > max_tokens {
4395            // Would exceed budget — try to fit at least the path
4396            let path_line = format!("{}\n", file.path);
4397            let path_tokens = path_line.len() / CHARS_PER_TOKEN;
4398            if used_tokens + path_tokens <= max_tokens {
4399                output.push_str(&path_line);
4400            }
4401            break;
4402        }
4403
4404        output.push_str(&section);
4405        used_tokens = output.len().min(max_chars) / CHARS_PER_TOKEN;
4406    }
4407
4408    output
4409}
4410
4411/// Render tier 0: full detail with callers, callees, and signatures.
4412fn render_tier0(graph: &RepoGraph, file_idx: usize, file: &FileNode, score: f32) -> String {
4413    let mut out = format!("## {} (rank: {score:.4})\n", file.path);
4414
4415    // Callers
4416    if file_idx < graph.callers.len() && !graph.callers[file_idx].is_empty() {
4417        let _ = write!(out, "  called by: ");
4418        let names: Vec<&str> = graph.callers[file_idx]
4419            .iter()
4420            .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
4421            .collect();
4422        let _ = writeln!(out, "{}", names.join(", "));
4423    }
4424
4425    // Callees
4426    if file_idx < graph.callees.len() && !graph.callees[file_idx].is_empty() {
4427        let _ = write!(out, "  calls: ");
4428        let names: Vec<&str> = graph.callees[file_idx]
4429            .iter()
4430            .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
4431            .collect();
4432        let _ = writeln!(out, "{}", names.join(", "));
4433    }
4434
4435    // Definitions with scope and signature
4436    for def in &file.defs {
4437        let scope_prefix = if def.scope.is_empty() {
4438            String::new()
4439        } else {
4440            format!("{} > ", def.scope)
4441        };
4442        if let Some(sig) = &def.signature {
4443            let _ = writeln!(out, "  {scope_prefix}{} {sig}", def.kind);
4444        } else {
4445            let _ = writeln!(out, "  {scope_prefix}{} {}", def.kind, def.name);
4446        }
4447    }
4448    let _ = writeln!(out);
4449    out
4450}
4451
4452/// Render tier 1: file path, rank, and signatures.
4453fn render_tier1(file: &FileNode, score: f32) -> String {
4454    let mut out = format!("## {} (rank: {score:.4})\n", file.path);
4455    for def in &file.defs {
4456        if let Some(sig) = &def.signature {
4457            let _ = writeln!(out, "  {sig}");
4458        } else {
4459            let _ = writeln!(out, "  {} {}", def.kind, def.name);
4460        }
4461    }
4462    let _ = writeln!(out);
4463    out
4464}
4465
4466/// Render tier 2: file path, rank, and definition names/kinds.
4467fn render_tier2(file: &FileNode, score: f32) -> String {
4468    let mut out = format!("{} (rank: {score:.4})", file.path);
4469    if !file.defs.is_empty() {
4470        let names: Vec<String> = file
4471            .defs
4472            .iter()
4473            .map(|d| format!("{}:{}", d.kind, d.name))
4474            .collect();
4475        let _ = write!(out, " -- {}", names.join(", "));
4476    }
4477    let _ = writeln!(out);
4478    out
4479}
4480
4481/// Render tier 3: file path only.
4482fn render_tier3(file: &FileNode) -> String {
4483    format!("{}\n", file.path)
4484}
4485
4486// ── JSON rendering ───────────────────────────────────────────────────
4487
4488/// Build the `lsp_location` for a file itself (line 0).
4489fn file_lsp_location(path: &str) -> RepoMapLspLocation {
4490    RepoMapLspLocation {
4491        file_path: if path.starts_with("./") || path.starts_with('/') {
4492            path.to_string()
4493        } else {
4494            format!("./{path}")
4495        },
4496        start_line: 0,
4497        start_character: 0,
4498        end_line: 0,
4499        end_character: 0,
4500    }
4501}
4502
4503/// Infer `ContentKind` from a file path's extension.
4504fn content_kind_for_path(path: &str) -> ContentKind {
4505    let ext = std::path::Path::new(path)
4506        .extension()
4507        .and_then(|e| e.to_str())
4508        .unwrap_or("");
4509    ContentKind::from_extension(ext)
4510}
4511
4512/// Minimum byte envelope reserved for each included file.
4513///
4514/// Even a file with zero symbols takes JSON overhead for path, rank, arrays,
4515/// etc. Calibrated against actual serde_json output for an empty `RepoMapFile`:
4516/// `{"lsp_location":{"file_path":"./src/file_N.rs","start_line":0,"start_character":0,`
4517/// `"end_line":0,"end_character":0},"rank":0.1234,"content_kind":"code",`
4518/// `"calls":[],"symbols":[],"truncated_symbols":0,"truncated_calls":0}` ≈ 250 bytes.
4519///
4520/// This floor prevents the budget allocator from giving a file so little space
4521/// that it can emit no envelope at all.
4522const FILE_ENVELOPE_MIN_BYTES: usize = 250;
4523
4524/// Minimum useful payload for an admitted file: envelope plus room for at
4525/// least 2-3 typical-sized symbols. Files whose fair share cannot meet this
4526/// floor are excluded entirely (Fix A, 4.0.2). Without this guard, low-rank
4527/// tail files consume budget on envelopes that contain no symbols or calls,
4528/// crowding out content for the top files.
4529const FILE_MIN_USEFUL_BYTES: usize = 600;
4530
4531/// Fraction of each file's per-file budget reserved for outgoing-call edges
4532/// after the envelope is paid. The remaining (1 - this) fraction goes to
4533/// symbols. Symbol leftover flows into calls; call leftover flows to the
4534/// next file. (Fix C, 4.0.2 — without a reserve, the symbol loop saturates
4535/// the per-file budget and calls always come up empty.)
4536///
4537/// This fraction is the FLOOR: if the stored callees' exact byte cost exceeds
4538/// `post_envelope * CALLS_BUDGET_FRACTION`, the calls reserve is raised to the
4539/// exact cost (capped at `CALLS_MAX_FRACTION`). See B-0059 / 4.1.10.
4540const CALLS_BUDGET_FRACTION: f64 = 0.30;
4541
4542/// Maximum fraction of `post_envelope` that may be reserved for calls.
4543///
4544/// Prevents calls from starving symbols entirely when there are many callees
4545/// with long paths. `CALLS_BUDGET_FRACTION` sets a floor; this sets a ceiling.
4546/// Together they implement the precise-reserve strategy (B-0059 / 4.1.10):
4547/// the reserve is `clamp(exact_calls_bytes, fraction_floor, fraction_ceil)`.
4548const CALLS_MAX_FRACTION: f64 = 0.70;
4549
4550/// Maximum fraction of the total budget that a single file may claim.
4551///
4552/// Without this cap a single very-high-rank file (e.g. `lib.rs`) could
4553/// consume the entire budget, leaving all other files empty.
4554const MAX_FILE_SHARE: f64 = 0.40;
4555
4556/// AST kind priority for orientation-style symbol ordering. Higher = surface
4557/// earlier. Used when def-level PageRank is degenerate (most ranks near zero)
4558/// to fall back on structural signal rather than noise.
4559///
4560/// The intuition: a reader orienting in a codebase wants to see the file's
4561/// *shape* before its *behaviors*. Types declare shape; functions declare
4562/// behavior; fields and constants are internal detail. This ordering matches
4563/// how humans read code top-down. (Fix B, 4.0.2.)
4564fn ast_kind_priority(kind: &str) -> u32 {
4565    match kind {
4566        // Tier 3: shape — what THIS file is
4567        "trait_item" | "interface" | "trait" => 30,
4568        "struct_item" | "struct" | "class_definition" | "class" => 29,
4569        "enum_item" | "enum" => 28,
4570        "type_item" | "type_alias_declaration" | "type_alias" => 27,
4571        "mod_item" | "module" | "namespace" => 26,
4572        // Tier 2: behavior — what THIS file does
4573        "function_item" | "function_definition" | "function" | "method_definition" => 20,
4574        "impl_item" | "impl" => 19,
4575        // Tier 1: declarations
4576        "const_item" | "const_declaration" | "const" => 10,
4577        "static_item" | "static" => 9,
4578        // Tier 0: internals (fields, variables, parameters)
4579        _ => 0,
4580    }
4581}
4582
4583/// Effective AST priority with corpus-relative rank promotion (4.0.4).
4584///
4585/// Preserves the 4.0.2 AST-priority ordering by default (types first,
4586/// then functions, then fields). When a def's PageRank significantly
4587/// exceeds the corpus median, promotes it up one or two tiers so that
4588/// load-bearing defs surface alongside their declared-tier neighbors.
4589///
4590/// Thresholds are corpus-median multiples (self-calibrating):
4591/// - rank > 4× median       → +1 tier (e.g., hot function joins type tier)
4592/// - rank > 16× median      → +2 tiers (extremely hot def)
4593/// - otherwise              → declared tier preserved
4594///
4595/// On degenerate (flat) rank distributions the median equals the floor,
4596/// nothing crosses threshold, and 4.0.2 AST-priority ordering is fully
4597/// preserved. On informative distributions (post-4.0.3 enrichment),
4598/// hot defs surface proportionally.
4599fn effective_priority(kind: &str, def_rank: f32, promo_1: f32, promo_2: f32) -> u32 {
4600    let base = ast_kind_priority(kind);
4601    // Accumulate promotion tiers as a plain integer to satisfy clippy's
4602    // bool_to_int_with_if lint while preserving branch clarity.
4603    let promo_tiers: u32 = u32::from(def_rank > promo_1) + u32::from(def_rank > promo_2);
4604    // Tier spacing matches ast_kind_priority's 10-unit gaps.
4605    base + promo_tiers * 10
4606}
4607
4608/// Estimate the serialised JSON byte cost of one `RepoMapSymbol`.
4609///
4610/// Calibrated against actual serde_json output. A `RepoMapSymbol` serialises to
4611/// approximately:
4612/// `{"name":"<N>","kind":<K>,"lsp_location":{"file_path":"<P>","start_line":0,`
4613/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
4614///
4615/// That is ~165 bytes of overhead (braces, keys, fixed-width integers, rank)
4616/// plus the name length and file_path length. We pass the path length
4617/// separately because the path is the same for all symbols in one file.
4618fn estimate_symbol_bytes(name: &str) -> usize {
4619    // 165 bytes overhead + name length.
4620    // The file_path is not included here because it is part of the
4621    // envelope cost accounted separately.
4622    165 + name.len()
4623}
4624
4625/// Estimate the serialised JSON byte cost of one `RepoMapCall`.
4626///
4627/// Each call entry: `{"lsp_location":{"file_path":"<P>","start_line":0,`
4628/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
4629/// ≈ 120 bytes overhead + path length.
4630fn estimate_call_bytes(target_path: &str) -> usize {
4631    120 + target_path.len()
4632}
4633
4634/// Render a `PageRank`-weighted JSON map with token-budget allocation (4.0.1).
4635///
4636/// # Algorithm
4637///
4638/// **Step 1 — File-share allocation.** Each eligible file receives a byte
4639/// budget proportional to its `base_rank`. The share is capped at 40% of
4640/// `budget_total_bytes` and floored at [`FILE_ENVELOPE_MIN_BYTES`] (200 B).
4641/// Files are included in rank order until the cumulative allocation would
4642/// exceed the total budget.
4643///
4644/// **Step 2 — Per-file symbol fill.** For each included file, symbols are
4645/// walked in def-rank descending order. Inclusion continues until either (a)
4646/// the file's budget share is exhausted (with carry-over of leftover bytes to
4647/// the next file) or (b) a logarithmic attenuation cutoff fires: symbol at
4648/// position `i` (0-based) is included only if its rank ≥ `top_rank /
4649/// (1 + ln(i + 1))`. `calls[]` is filled in target-file base-rank order
4650/// subject to a hard [`MAX_FILE_CALLS`] render cap (no attenuation — I#68,
4651/// 4.1.4) and per-file byte budget. `truncated_symbols` and `truncated_calls`
4652/// track the count of omitted entries.
4653///
4654/// **Step 3 — Response telemetry.** The response includes `estimated_bytes`
4655/// (actual returned content size), `budget_bytes` (`token_budget * 4`),
4656/// and `budget_exhausted` (`total_files > files.len()`).
4657///
4658/// # Arguments
4659///
4660/// - `graph` — the built dependency graph.
4661/// - `token_budget` — caller-specified token budget (× 4 = byte budget).
4662/// - `focus` — optional file index for topic-sensitive `PageRank`.
4663/// - `include_metadata` — when `false` (default), Meta-classified files
4664///   are excluded before ranking.
4665#[must_use]
4666#[expect(
4667    clippy::cast_precision_loss,
4668    reason = "rank sums and counts are small f32/f64; precision loss is acceptable"
4669)]
4670#[expect(
4671    clippy::too_many_lines,
4672    reason = "the three-step allocation algorithm (file-share → symbol-fill → calls-fill) \
4673              is sequential and share state; splitting into helpers would require passing \
4674              mutable slices across three boundaries with no clarity gain"
4675)]
4676pub fn render_json_budgeted(
4677    graph: &RepoGraph,
4678    token_budget: usize,
4679    focus: Option<usize>,
4680    include_metadata: bool,
4681) -> GetRepoMapResponse {
4682    let n = graph.files.len();
4683    if n == 0 {
4684        let budget_bytes = token_budget * CHARS_PER_TOKEN;
4685        return GetRepoMapResponse {
4686            files: vec![],
4687            total_files: 0,
4688            estimated_bytes: 0,
4689            budget_bytes,
4690            budget_exhausted: false,
4691            capped: false,
4692        };
4693    }
4694
4695    let budget_total_bytes = token_budget * CHARS_PER_TOKEN;
4696
4697    // Recompute topic-sensitive ranks if focus is given.
4698    let ranks = if focus.is_some() {
4699        pagerank(n, &graph.edges, focus)
4700    } else {
4701        graph.base_ranks.clone()
4702    };
4703
4704    // Sort all file indices by rank descending.
4705    let mut sorted: Vec<usize> = (0..n).collect();
4706    sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
4707
4708    // Apply metadata exclusion filter.
4709    let eligible: Vec<usize> = if include_metadata {
4710        sorted
4711    } else {
4712        sorted
4713            .into_iter()
4714            .filter(|&idx| {
4715                let kind = content_kind_for_path(&graph.files[idx].path);
4716                kind != ContentKind::Meta
4717            })
4718            .collect()
4719    };
4720
4721    let total_files = eligible.len();
4722
4723    // ── Corpus-median def-rank thresholds for tier promotion (4.0.4) ────────
4724    //
4725    // Compute once per call (corpus-wide, not per-file) so the threshold is
4726    // self-calibrating: flat distributions (all ranks equal) set median = floor
4727    // and nothing crosses threshold; informative distributions see proportional
4728    // promotion. Using corpus-wide median ensures a hot function in one file is
4729    // judged against the entire corpus, not just its local file peers.
4730    // Use the 75th percentile of nonzero def-ranks as the corpus reference value
4731    // for tier promotion (rather than the 50th percentile / median). The 75th
4732    // percentile is more robust: on a flat distribution most defs cluster near the
4733    // floor, so the 75th percentile is only marginally above the floor (making the
4734    // 4× threshold very selective). On an informative distribution (post-4.0.3
4735    // call-edge enrichment) the 75th percentile is meaningfully above the floor,
4736    // so the same 4× multiplier captures genuinely hot defs without falsely
4737    // promoting slightly-above-floor helpers.
4738    //
4739    // The 50th percentile (lower median) was rejected because on a 10-def corpus
4740    // with max/min ratio 5× the median equals the floor, causing the 4× threshold
4741    // to fire on defs that are only 5× above floor (a low-variance corpus). The
4742    // 75th percentile corrects this without requiring hand-tuned per-corpus magic
4743    // numbers.
4744    let corpus_reference_rank: f32 = {
4745        let mut nonzero: Vec<f32> = graph
4746            .def_ranks
4747            .iter()
4748            .copied()
4749            .filter(|r| *r > 0.0)
4750            .collect();
4751        if nonzero.is_empty() {
4752            0.0
4753        } else {
4754            nonzero.sort_unstable_by(f32::total_cmp);
4755            let n = nonzero.len();
4756            // 75th percentile index: floor(0.75 * (n - 1))
4757            let idx = (3 * (n - 1)) / 4;
4758            nonzero[idx]
4759        }
4760    };
4761    let promo_1_threshold = corpus_reference_rank * 4.0; // +1 tier
4762    let promo_2_threshold = corpus_reference_rank * 16.0; // +2 tiers
4763
4764    // ── Step 1: File-share allocation ────────────────────────────────
4765
4766    // Greedily determine which files fit within the budget, computing each
4767    // file's share as it is added. We must run a two-pass approach:
4768    //   pass A: determine which files are included (cumulative sum check),
4769    //   pass B: fill symbols/calls using final per-file allocations.
4770    //
4771    // The "included" decision is based on the running cumulative sum so that
4772    // the leftover redistribution in step 2 can carry forward correctly.
4773
4774    // Floor-first admission (Fix A, 4.0.2):
4775    //
4776    // Cap admitted file count so each gets at least FILE_MIN_USEFUL_BYTES.
4777    // Below this threshold the response would carry envelopes that contain
4778    // no symbols or calls — pure overhead, no information. Concentrating
4779    // the budget on fewer files with real content is strictly better for
4780    // orientation than dropping envelope sentinels for many files.
4781    let max_admissible = budget_total_bytes / FILE_MIN_USEFUL_BYTES;
4782    let admit_count = eligible.len().min(max_admissible.max(1));
4783
4784    let budget_f64 = budget_total_bytes as f64;
4785
4786    // Pre-compute rank sum across ADMITTED files only (top-N by rank). f64
4787    // to avoid precision loss when summing many small f32 values.
4788    let admitted_rank_sum: f64 = eligible
4789        .iter()
4790        .take(admit_count)
4791        .map(|&idx| f64::from(ranks[idx]))
4792        .sum();
4793    let admitted_rank_sum = if admitted_rank_sum > 0.0 {
4794        admitted_rank_sum
4795    } else {
4796        1.0
4797    };
4798
4799    // Compute per-file budgets. Each admitted file gets at least
4800    // FILE_MIN_USEFUL_BYTES; the proportional-to-rank share is applied on
4801    // top of the floor and capped at MAX_FILE_SHARE.
4802    let mut included_indices: Vec<usize> = Vec::new(); // indices into `eligible`
4803    let mut file_budgets: Vec<usize> = Vec::new();
4804    let mut cumulative_budget: usize = 0;
4805
4806    for (i, &file_idx) in eligible.iter().take(admit_count).enumerate() {
4807        let file_rank = f64::from(ranks[file_idx]);
4808        let raw_share = budget_f64 * file_rank / admitted_rank_sum;
4809        let capped = raw_share.min(budget_f64 * MAX_FILE_SHARE);
4810        // `capped` is non-negative and bounded by budget_f64 (a usize).
4811        #[expect(
4812            clippy::cast_possible_truncation,
4813            clippy::cast_sign_loss,
4814            reason = "capped is non-negative and bounded by budget_total_bytes (a usize)"
4815        )]
4816        let budget_i = (capped as usize).max(FILE_MIN_USEFUL_BYTES);
4817
4818        if cumulative_budget + budget_i > budget_total_bytes && !included_indices.is_empty() {
4819            break;
4820        }
4821        cumulative_budget += budget_i;
4822        included_indices.push(i);
4823        file_budgets.push(budget_i);
4824    }
4825
4826    // ── Step 2: Per-file symbol fill ─────────────────────────────────
4827
4828    let mut result_files: Vec<RepoMapFile> = Vec::with_capacity(included_indices.len());
4829    let mut leftover: usize = 0; // unused bytes carried from previous file
4830
4831    for (slot, &eligible_i) in included_indices.iter().enumerate() {
4832        let file_idx = eligible[eligible_i];
4833        let file = &graph.files[file_idx];
4834        let file_rank = ranks[file_idx];
4835        let file_path_lsp = file_lsp_location(&file.path);
4836
4837        let budget_in = file_budgets[slot] + leftover;
4838
4839        // Pre-compute callee indices here (before the budget split) so the
4840        // precise byte cost of rendering all stored callees is available when
4841        // setting calls_reserve. This is the B-0059 fix (4.1.10): the old code
4842        // placed callee_indices after the symbol loop, so calls_reserve was
4843        // always the fraction-floor — even when the stored callees needed more.
4844        let callee_indices: Vec<usize> = if file_idx < graph.callees.len() {
4845            let mut callees: Vec<(usize, f32)> = graph.callees[file_idx]
4846                .iter()
4847                .filter_map(|&ci| {
4848                    let ci = ci as usize;
4849                    graph.files.get(ci).map(|_| {
4850                        let r = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
4851                        (ci, r)
4852                    })
4853                })
4854                .collect();
4855            callees.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
4856            callees.into_iter().map(|(ci, _)| ci).collect()
4857        } else {
4858            vec![]
4859        };
4860
4861        // Reserve a fraction of the post-envelope budget for outgoing calls
4862        // (Fix C, 4.0.2). Without this guard the symbol loop saturates
4863        // `budget_in` and the calls loop always trips its byte-check.
4864        // Symbol leftover flows into calls; call leftover flows to the
4865        // next file via the outer `leftover` variable.
4866        //
4867        // B-0059 (4.1.10): raise the reserve to the exact byte cost of
4868        // rendering all stored callees (up to MAX_FILE_CALLS), clamped to
4869        // [fraction_floor, CALLS_MAX_FRACTION * post_envelope]. This ensures
4870        // that at any budget where calls physically fit, they are not crowded
4871        // out by symbols consuming the fraction-floor reserve.
4872        let post_envelope = budget_in.saturating_sub(FILE_ENVELOPE_MIN_BYTES);
4873        let precise_calls_bytes: usize = callee_indices
4874            .iter()
4875            .take(MAX_FILE_CALLS)
4876            .map(|&ci| estimate_call_bytes(&graph.files[ci].path))
4877            .sum();
4878        #[expect(
4879            clippy::cast_possible_truncation,
4880            clippy::cast_sign_loss,
4881            reason = "post_envelope * fraction is bounded by post_envelope (a usize); result is non-negative"
4882        )]
4883        let fraction_floor = (post_envelope as f64 * CALLS_BUDGET_FRACTION) as usize;
4884        #[expect(
4885            clippy::cast_possible_truncation,
4886            clippy::cast_sign_loss,
4887            reason = "post_envelope * fraction is bounded by post_envelope (a usize); result is non-negative"
4888        )]
4889        let fraction_ceil = (post_envelope as f64 * CALLS_MAX_FRACTION) as usize;
4890        let calls_reserve = precise_calls_bytes.max(fraction_floor).min(fraction_ceil);
4891        let symbols_budget = FILE_ENVELOPE_MIN_BYTES + post_envelope.saturating_sub(calls_reserve);
4892        let mut used: usize = FILE_ENVELOPE_MIN_BYTES; // envelope cost
4893
4894        // ── Symbols ──────────────────────────────────────────────────
4895        // Retrieve def-level ranks for this file via the offset table.
4896        let def_count = file.defs.len();
4897        let def_offset = if file_idx < graph.def_offsets.len() {
4898            graph.def_offsets[file_idx]
4899        } else {
4900            0
4901        };
4902
4903        // Build (def_idx, rank, kind_priority, start_byte) tuples. We sort
4904        // by a composite key: AST kind priority (descending) — putting types
4905        // before functions before fields — then by def_rank (descending)
4906        // within each tier. This is Fix B (4.0.2): the def_rank distribution
4907        // is often degenerate (most defs share near-zero rank because the
4908        // call-edge extractor doesn't capture every dispatch), so we use
4909        // structural signal as the primary ordering and def_rank as the
4910        // within-tier tiebreaker. When def_rank IS informative, it dominates
4911        // *within* its kind tier and recovers the original behavior; the AST
4912        // signal only shifts ordering *between* tiers.
4913        let mut def_rank_pairs: Vec<(usize, f32, u32, u32)> = (0..def_count)
4914            .map(|di| {
4915                let flat = def_offset + di;
4916                let r = graph.def_ranks.get(flat).copied().unwrap_or(0.0);
4917                // Store the ORIGINAL ast_kind_priority in the tuple (used by the
4918                // per-tier attenuation loop below). The sort comparator uses
4919                // effective_priority (which may be higher due to 4.0.4 promotion)
4920                // to reorder hot defs ahead of cold type-tier defs, while the
4921                // attenuation tier tracker continues to use the original AST tier
4922                // so the existing per-tier cutoff behaviour is preserved.
4923                let kind_prio = ast_kind_priority(&file.defs[di].kind);
4924                let decl_order = file.defs[di].start_byte;
4925                (di, r, kind_prio, decl_order)
4926            })
4927            .collect();
4928        def_rank_pairs.sort_unstable_by(|a, b| {
4929            // Primary: effective priority (4.0.4: AST kind + corpus-rank promotion) descending.
4930            // Hot defs that exceed corpus-median thresholds are promoted above their
4931            // declared tier so they surface before cold type-tier defs.
4932            let eff_a = effective_priority(
4933                &file.defs[a.0].kind,
4934                a.1,
4935                promo_1_threshold,
4936                promo_2_threshold,
4937            );
4938            let eff_b = effective_priority(
4939                &file.defs[b.0].kind,
4940                b.1,
4941                promo_1_threshold,
4942                promo_2_threshold,
4943            );
4944            eff_b
4945                .cmp(&eff_a)
4946                // Secondary: def_rank descending within tier.
4947                .then_with(|| b.1.total_cmp(&a.1))
4948                // Tertiary: earlier declaration order (stable, deterministic).
4949                .then_with(|| a.3.cmp(&b.3))
4950        });
4951
4952        let top_def_rank = def_rank_pairs.first().map(|&(_, r, _, _)| r).unwrap_or(0.0);
4953
4954        let mut symbols: Vec<RepoMapSymbol> = Vec::new();
4955        let mut truncated_symbols: usize = 0;
4956
4957        // Track per-tier position for the attenuation cutoff. When AST kind
4958        // priority changes (we've moved from types to functions, say), reset
4959        // the position so the attenuation curve restarts. Otherwise a
4960        // structurally-equivalent-but-later tier would be unfairly cut.
4961        let mut tier_pos: usize = 0;
4962        let mut current_tier: Option<u32> = None;
4963        let mut tier_top_rank: f32 = top_def_rank;
4964
4965        for (pos, &(di, def_r, kind_prio, _)) in def_rank_pairs.iter().enumerate() {
4966            // Reset attenuation at tier boundaries.
4967            if current_tier != Some(kind_prio) {
4968                current_tier = Some(kind_prio);
4969                tier_pos = 0;
4970                tier_top_rank = def_r;
4971            }
4972
4973            // Logarithmic attenuation cutoff, relative to the tier's top rank.
4974            let cutoff = if tier_top_rank > 0.0 {
4975                tier_top_rank / (1.0 + (tier_pos as f32 + 1.0).ln())
4976            } else {
4977                0.0
4978            };
4979            if def_r < cutoff {
4980                // Attenuation cuts the rest of THIS tier; we don't stop
4981                // entirely because the next tier may still have useful
4982                // content within its own attenuation curve. Skip this def.
4983                truncated_symbols += 1;
4984                tier_pos += 1;
4985                continue;
4986            }
4987
4988            let def = &file.defs[di];
4989            let sym_bytes = estimate_symbol_bytes(&def.name);
4990            // Use the reserved symbols sub-budget (Fix C) so calls aren't
4991            // starved when symbols would otherwise saturate budget_in.
4992            if used + sym_bytes > symbols_budget {
4993                truncated_symbols += def_rank_pairs.len() - pos;
4994                break;
4995            }
4996
4997            // C2 (4.1.1): Use the AST-computed lsp_kind_hint when available
4998            // (populated at parse time for Python decorated_definition nodes).
4999            // Fall back to the AST-less string mapping for all other kinds.
5000            let kind = def
5001                .lsp_kind_hint
5002                .unwrap_or_else(|| crate::languages::lsp_symbol_kind_for_node_kind(&def.kind));
5003            let line_0 = def.start_line.saturating_sub(1) as usize;
5004            symbols.push(RepoMapSymbol {
5005                name: def.name.clone(),
5006                kind,
5007                lsp_location: RepoMapLspLocation {
5008                    file_path: file_path_lsp.file_path.clone(),
5009                    start_line: line_0,
5010                    start_character: 0,
5011                    end_line: line_0,
5012                    end_character: 0,
5013                },
5014                rank: def_r,
5015            });
5016            used += sym_bytes;
5017            tier_pos += 1;
5018        }
5019
5020        // ── Calls ─────────────────────────────────────────────────────
5021        // `callee_indices` was pre-computed above (before the budget split) for
5022        // the precise-reserve calculation. Re-use it here.
5023        let call_total = callee_indices.len();
5024
5025        let mut calls: Vec<RepoMapCall> = Vec::new();
5026        let mut truncated_calls: usize = 0;
5027
5028        for (pos, &ci) in callee_indices.iter().enumerate() {
5029            let callee_rank = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
5030
5031            // Hard render-time cap (I#68, 4.1.4): stop once we have rendered
5032            // MAX_FILE_CALLS entries. This is symmetric with the MAX_NEIGHBORS
5033            // graph-build cap (I#60) and replaces the old logarithmic attenuation
5034            // cutoff. Attenuation was correct for *symbols* (rank distributions
5035            // are informative) but pathological for *call edges*: in real corpora
5036            // callee base-ranks follow a geometric distribution, causing attenuation
5037            // to fire at pos=1 and collapse calls[] to a single entry.
5038            if calls.len() >= MAX_FILE_CALLS {
5039                truncated_calls += call_total - pos;
5040                break;
5041            }
5042
5043            let callee_path = &graph.files[ci].path;
5044            let call_bytes = estimate_call_bytes(callee_path);
5045            if used + call_bytes > budget_in {
5046                truncated_calls += call_total - pos;
5047                break;
5048            }
5049
5050            calls.push(RepoMapCall {
5051                lsp_location: file_lsp_location(callee_path),
5052                rank: callee_rank,
5053            });
5054            used += call_bytes;
5055        }
5056
5057        // Carry unused bytes forward to the next file.
5058        leftover = budget_in.saturating_sub(used);
5059
5060        result_files.push(RepoMapFile {
5061            lsp_location: file_path_lsp,
5062            rank: file_rank,
5063            content_kind: content_kind_tag(content_kind_for_path(&file.path)),
5064            calls,
5065            symbols,
5066            truncated_symbols,
5067            truncated_calls,
5068        });
5069    }
5070
5071    let estimated_bytes = serde_json::to_string(&result_files)
5072        .map(|s| s.len())
5073        .unwrap_or(0);
5074
5075    let budget_exhausted = total_files > result_files.len();
5076
5077    GetRepoMapResponse {
5078        files: result_files,
5079        total_files,
5080        estimated_bytes,
5081        budget_bytes: budget_total_bytes,
5082        budget_exhausted,
5083        capped: budget_exhausted,
5084    }
5085}
5086
5087/// Render a `PageRank`-sorted JSON map of the repository (4.0.0 compatibility shim).
5088///
5089/// This function wraps [`render_json_budgeted`] with a synthetic token budget
5090/// derived from `max_files * 2000` (a generous per-file allowance). It exists
5091/// to keep the existing D1/D2 unit tests compiling without change; the MCP
5092/// layer calls [`render_json_budgeted`] directly in 4.0.1.
5093///
5094/// The `capped` field in the response reflects whether the budget was
5095/// exhausted before all `eligible` files were included, which is equivalent
5096/// to the previous `total_files > max_files` check.
5097///
5098/// When `include_metadata` is `false` (default), files whose extension
5099/// classifies as [`ContentKind::Meta`] are excluded before ranking.
5100#[must_use]
5101pub fn render_json(
5102    graph: &RepoGraph,
5103    max_files: usize,
5104    focus: Option<usize>,
5105    include_metadata: bool,
5106) -> GetRepoMapResponse {
5107    // Synthesise a generous token budget: 2000 tokens per requested file.
5108    // This ensures the existing D1/D2 tests (which pass small max_files values
5109    // like 3, 5, 50) see the same file-count behaviour they expect. The test
5110    // assertions check file counts, not byte sizes, so the exact budget value
5111    // only matters for ensuring enough headroom.
5112    let token_budget = max_files.saturating_mul(2000);
5113    render_json_budgeted(graph, token_budget, focus, include_metadata)
5114}
5115
5116// ── Tests ────────────────────────────────────────────────────────────
5117
5118#[cfg(test)]
5119mod tests {
5120    use super::*;
5121
5122    #[test]
5123    fn test_pagerank_simple() {
5124        // 3-node graph: 0 -> 1 -> 2, 2 -> 0 (cycle)
5125        let edges = vec![(0, 1, 1), (1, 2, 1), (2, 0, 1)];
5126        let ranks = pagerank(3, &edges, None);
5127
5128        // All nodes in a symmetric cycle should have equal rank
5129        assert_eq!(ranks.len(), 3);
5130        let sum: f32 = ranks.iter().sum();
5131        assert!(
5132            (sum - 1.0).abs() < 0.01,
5133            "ranks should sum to ~1.0, got {sum}"
5134        );
5135
5136        // In a perfect cycle, all ranks should be approximately equal
5137        let expected = 1.0 / 3.0;
5138        for (i, &r) in ranks.iter().enumerate() {
5139            assert!(
5140                (r - expected).abs() < 0.05,
5141                "rank[{i}] = {r}, expected ~{expected}"
5142            );
5143        }
5144    }
5145
5146    #[test]
5147    fn test_pagerank_star() {
5148        // Star graph: 0,1,2 all point to 3
5149        let edges = vec![(0, 3, 1), (1, 3, 1), (2, 3, 1)];
5150        let ranks = pagerank(4, &edges, None);
5151
5152        assert_eq!(ranks.len(), 4);
5153        // Node 3 should have the highest rank
5154        let max_idx = ranks
5155            .iter()
5156            .enumerate()
5157            .max_by(|a, b| a.1.total_cmp(b.1))
5158            .unwrap()
5159            .0;
5160        assert_eq!(max_idx, 3, "node 3 should have highest rank");
5161        assert!(
5162            ranks[3] > ranks[0],
5163            "rank[3]={} should be > rank[0]={}",
5164            ranks[3],
5165            ranks[0]
5166        );
5167    }
5168
5169    #[test]
5170    fn test_pagerank_topic_sensitive() {
5171        // 10-node chain: 0 -> 1 -> ... -> 9.
5172        //
5173        // With PERSONALIZATION_ALPHA = 0.15 and n = 10, the uniform share per
5174        // node is 1/10 = 0.10.  The focus node (0) gets 0.15 teleportation
5175        // mass vs 0.10 uniform, so focused rank[0] > uniform rank[0] holds.
5176        //
5177        // The 3-node chain used previously broke when alpha was reduced from
5178        // 0.70 to 0.15 because 0.15 < 1/3 = 0.33 for n=3 — the focus node
5179        // received *less* teleportation than its uniform share, inverting the
5180        // expected direction.  Using n=10 avoids this edge case while still
5181        // testing the personalization effect.
5182        let n = 10_usize;
5183        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5184        let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
5185            .map(|i| (i as u32, (i + 1) as u32, 1_u32))
5186            .collect();
5187        let uniform_ranks = pagerank(n, &edges, None);
5188        let biased_ranks = pagerank(n, &edges, Some(0));
5189
5190        // With focus on node 0, it should get a higher rank than uniform
5191        // because PERSONALIZATION_ALPHA (0.15) > 1/n (0.10) for n=10.
5192        assert!(
5193            biased_ranks[0] > uniform_ranks[0],
5194            "focused rank[0]={} should be > uniform rank[0]={}",
5195            biased_ranks[0],
5196            uniform_ranks[0]
5197        );
5198    }
5199
5200    // ── J1 tests — topic-sensitive PageRank soft personalization ─────────
5201
5202    /// J1 RED: `focus_file` PageRank must not collapse other-file ranks.
5203    ///
5204    /// Baseline (pre-4.0.5) concentrated 70% mass on the focus node, producing
5205    /// a degenerate Dirac delta: focus rank ≈ 0.703, all others ≈ 0.003.
5206    /// This test fails on the baseline and must pass after the fix.
5207    ///
5208    /// Invariant: with `PERSONALIZATION_ALPHA = 0.15`, focus node gets 0.15 of
5209    /// teleportation mass and each of the other (n-1) nodes gets 0.85/(n-1).
5210    /// On a star graph with n=10 nodes, the focus node rank must NOT be more
5211    /// than 40× the average non-focus rank.  The 4.0.5 fix targets roughly
5212    /// 5-10× for a well-connected graph, so 40× is a conservative upper bound
5213    /// that the baseline (≈200×) fails.
5214    #[test]
5215    fn test_focus_file_topic_pagerank_preserves_rank_dispersion() {
5216        // Star graph: nodes 1..9 all point to node 0 (high natural rank).
5217        // Focus on node 1 (low natural rank) to test personalization effect.
5218        let n = 10_usize;
5219        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5220        let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
5221
5222        let ranks_focused = pagerank(n, &edges, Some(1));
5223
5224        let focus_rank = ranks_focused[1];
5225        let sum_non_focus: f32 = ranks_focused
5226            .iter()
5227            .enumerate()
5228            .filter(|&(i, _)| i != 1)
5229            .map(|(_, &r)| r)
5230            .sum();
5231        let n_non_focus = (n - 1) as f32;
5232        let avg_non_focus = sum_non_focus / n_non_focus;
5233
5234        let dispersion_ratio = focus_rank / avg_non_focus;
5235
5236        eprintln!(
5237            "J1 dispersion: focus_rank={focus_rank:.6}, avg_non_focus={avg_non_focus:.6}, \
5238             ratio={dispersion_ratio:.2}× (must be <= 40×)"
5239        );
5240
5241        // With 0.15 personalization alpha the focus node's teleportation
5242        // advantage is modest; 40× is an upper bound the old 0.70 code violates.
5243        assert!(
5244            dispersion_ratio <= 40.0,
5245            "focus rank is {dispersion_ratio:.1}× avg non-focus rank (must be ≤ 40×); \
5246             pre-fix baseline was ~200× due to 70% concentration — I#16"
5247        );
5248
5249        // Ranks must still sum to ~1.
5250        let total: f32 = ranks_focused.iter().sum();
5251        assert!(
5252            (total - 1.0).abs() < 0.01,
5253            "ranks must sum to ≈1.0; got {total}"
5254        );
5255    }
5256
5257    /// J1 RED: focus node must have the highest rank (it still gets the bias),
5258    /// but non-focus nodes must NOT collapse to a flat floor.
5259    ///
5260    /// Concretely: the second-highest-ranked file must be ≥ 10% of the focus
5261    /// file's rank (neighborhood rebiasing, not winner-take-all).
5262    #[test]
5263    fn test_focus_file_topic_pagerank_does_not_collapse_other_files() {
5264        // Linear chain: 0 → 1 → 2 → ... → 9 (directed).
5265        // Focus on node 0.  Without personalization, ranks decrease along the
5266        // chain.  With soft personalization the non-focus nodes stay non-trivial.
5267        let n = 10_usize;
5268        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5269        let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
5270            .map(|i| (i as u32, (i + 1) as u32, 1_u32))
5271            .collect();
5272
5273        let ranks = pagerank(n, &edges, Some(0));
5274
5275        let focus_rank = ranks[0];
5276        // All non-focus ranks must be ≥ 10% of focus rank.
5277        for (i, &r) in ranks.iter().enumerate().skip(1) {
5278            assert!(
5279                r >= focus_rank * 0.10,
5280                "rank[{i}] = {r:.6} is < 10% of focus rank {focus_rank:.6}; \
5281                 non-focus files must not collapse to near-zero (I#16)"
5282            );
5283        }
5284    }
5285
5286    // ── J2 tests — neighborhood count parity ─────────────────────────────
5287
5288    /// J2 RED: `render_json_budgeted` with `focus=Some(i)` must return at
5289    /// least 70% as many files as the unfocused call with the same budget.
5290    ///
5291    /// Baseline (pre-4.0.5, α=0.70) collapsed the focused run to 1 dominant
5292    /// file + a flat tail — that's what this test was originally written to
5293    /// catch. The threshold was 80% when α was 0.15; raising α to 0.35
5294    /// (4.1.12, to satisfy the flask focus-in-top-3 invariant) pulls budget
5295    /// toward the focus neighborhood and drops file count to ~70% on small
5296    /// synthetic stars. 70% remains far from the "1 file + zero tail"
5297    /// degenerate baseline this test guards against.
5298    #[test]
5299    fn test_focus_file_returns_neighborhood_not_just_focus() {
5300        // Build a 12-file star graph with meaningful rank variation.
5301        let n = 12_usize;
5302        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5303        let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
5304        let base_ranks = pagerank(n, &edges, None);
5305        let (callers, callees) = build_neighbor_lists(n, &edges);
5306
5307        let file_nodes: Vec<FileNode> = (0..n)
5308            .map(|i| FileNode {
5309                path: format!("src/file_{i}.rs"),
5310                defs: vec![Definition {
5311                    name: format!("func_{i}"),
5312                    kind: "function_item".to_string(),
5313                    start_line: 1,
5314                    end_line: 5,
5315                    scope: String::new(),
5316                    signature: Some(format!("fn func_{i}() -> i32")),
5317                    start_byte: 0,
5318                    end_byte: 100,
5319                    calls: vec![],
5320                    decorator: None,
5321                    lsp_kind_hint: None,
5322                }],
5323                imports: vec![],
5324            })
5325            .collect();
5326
5327        let graph = RepoGraph {
5328            files: file_nodes,
5329            edges,
5330            base_ranks,
5331            callers,
5332            callees,
5333            def_edges: vec![],
5334            def_ranks: vec![],
5335            def_callers: vec![],
5336            def_callees: vec![],
5337            def_offsets: vec![0],
5338            alpha: 0.5,
5339        };
5340
5341        let budget = 2000; // generous budget; all 12 files should fit
5342        let unfocused = render_json_budgeted(&graph, budget, None, false);
5343        let focused = render_json_budgeted(&graph, budget, Some(1), false);
5344
5345        let unfocused_n = unfocused.files.len();
5346        let focused_n = focused.files.len();
5347        #[expect(
5348            clippy::cast_possible_truncation,
5349            clippy::cast_sign_loss,
5350            reason = "unfocused_n is a file count (small, positive); f32 multiplication \
5351                      by 0.80 and ceil produce a value in [0, n]; truncation to usize is safe"
5352        )]
5353        let min_expected = (unfocused_n as f32 * 0.70).ceil() as usize;
5354
5355        eprintln!(
5356            "J2 neighborhood: unfocused={unfocused_n} files, focused={focused_n} files \
5357             (need ≥ {min_expected})"
5358        );
5359
5360        assert!(
5361            focused_n >= min_expected,
5362            "focused call returned {focused_n} files; expected ≥ {min_expected} \
5363             (70% of unfocused {unfocused_n}); soft personalization must preserve \
5364             rank dispersion across files (I#16/J2)"
5365        );
5366    }
5367
5368    /// J2 RED: topic delta fingerprinting — focused run must reorder files
5369    /// relative to unfocused run (focus file surfaces near top), but both
5370    /// must contain similar total file counts.
5371    #[test]
5372    fn test_focus_delta_topic_fingerprinting_works() {
5373        // Bidirectional 8-file ring so all nodes are structurally equivalent.
5374        // Without focus all ranks are equal.  With focus on node 3, node 3
5375        // must surface as the highest-ranked file.
5376        let n = 8_usize;
5377        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5378        let edges: Vec<(u32, u32, u32)> = (0..n)
5379            .flat_map(|i| {
5380                let next = ((i + 1) % n) as u32;
5381                let curr = i as u32;
5382                [(curr, next, 1_u32), (next, curr, 1_u32)]
5383            })
5384            .collect();
5385
5386        let ranks_uniform = pagerank(n, &edges, None);
5387        let ranks_focused = pagerank(n, &edges, Some(3));
5388
5389        // Focus node must have highest rank.
5390        let top_idx = ranks_focused
5391            .iter()
5392            .enumerate()
5393            .max_by(|a, b| a.1.total_cmp(b.1))
5394            .map(|(i, _)| i)
5395            .unwrap();
5396
5397        assert_eq!(
5398            top_idx, 3,
5399            "with focus=Some(3), node 3 must have highest rank; top was {top_idx}"
5400        );
5401
5402        // Uniform baseline: all ranks should be approximately equal.
5403        let uniform_max = ranks_uniform
5404            .iter()
5405            .copied()
5406            .fold(f32::NEG_INFINITY, f32::max);
5407        let uniform_min = ranks_uniform.iter().copied().fold(f32::INFINITY, f32::min);
5408        assert!(
5409            (uniform_max - uniform_min).abs() < 0.01,
5410            "on a ring without focus all ranks should be ≈equal; max={uniform_max:.6} min={uniform_min:.6}"
5411        );
5412
5413        // Focused run must rank the focus node significantly higher than others
5414        // but others must remain non-trivial (≥ 5% of focus).
5415        let focus_rank = ranks_focused[3];
5416        for (i, &r) in ranks_focused.iter().enumerate().filter(|&(i, _)| i != 3) {
5417            assert!(
5418                r >= focus_rank * 0.05,
5419                "rank[{i}]={r:.6} is < 5% of focus rank {focus_rank:.6}; \
5420                 soft personalization must preserve non-focus ranks"
5421            );
5422        }
5423    }
5424
5425    // ── T1 tests — focus_file resolver normalization (I#20) ──────────────
5426
5427    /// Build a tiny synthetic graph whose `FileNode::path` values match the
5428    /// shape `build_graph` produces on disk (no leading `./`, forward slashes).
5429    fn focus_resolver_graph() -> RepoGraph {
5430        let file_nodes: Vec<FileNode> = vec![
5431            FileNode {
5432                path: "device_opt/services/storage.py".to_string(),
5433                defs: vec![],
5434                imports: vec![],
5435            },
5436            FileNode {
5437                path: "device_opt/ui/textual/screens/settings.py".to_string(),
5438                defs: vec![],
5439                imports: vec![],
5440            },
5441            FileNode {
5442                path: "device_opt/services/registry.py".to_string(),
5443                defs: vec![],
5444                imports: vec![],
5445            },
5446            FileNode {
5447                path: "tests/test_storage.py".to_string(),
5448                defs: vec![],
5449                imports: vec![],
5450            },
5451        ];
5452        let n = file_nodes.len();
5453        RepoGraph {
5454            files: file_nodes,
5455            edges: vec![],
5456            base_ranks: vec![1.0 / n as f32; n],
5457            callers: vec![vec![]; n],
5458            callees: vec![vec![]; n],
5459            def_edges: vec![],
5460            def_ranks: vec![],
5461            def_callers: vec![],
5462            def_callees: vec![],
5463            def_offsets: vec![0; n + 1],
5464            alpha: 0.5,
5465        }
5466    }
5467
5468    /// T1: focus_file paths emitted by `lsp_location.file_path` (with the
5469    /// `./` prefix) must resolve to the correct file index.
5470    ///
5471    /// Baseline reproduction (mnemosyne corpus, 4.0.5): passing
5472    /// `focus_file="./device_opt/.../settings.py"` produced rank values
5473    /// bit-identical to the unfocused call because the strict-suffix matcher
5474    /// in `tools.rs` failed both the `exact` and the `strict_suffix` checks
5475    /// when the focus carried the LSP `./` prefix. The matcher silently
5476    /// returned `focus = None`, masking the failure as "topic-sensitive
5477    /// PageRank does nothing on Python".
5478    #[test]
5479    fn test_focus_file_resolver_accepts_lsp_location_path() {
5480        let g = focus_resolver_graph();
5481        // LSP-shaped path with leading `./` — the form documented in
5482        // get_repo_map's instructions.
5483        let res = g.resolve_focus_file("./device_opt/ui/textual/screens/settings.py");
5484        match res {
5485            FocusResolution::Found(idx) => {
5486                assert_eq!(
5487                    g.files[idx].path, "device_opt/ui/textual/screens/settings.py",
5488                    "resolver must accept the ./-prefixed LSP path form (I#20)"
5489                );
5490            }
5491            FocusResolution::NotFound | FocusResolution::Ambiguous(_) => {
5492                panic!(
5493                    "resolver returned {res:?} for ./device_opt/ui/textual/screens/settings.py; \
5494                     the LSP-shaped path form must resolve to exactly one file (I#20)"
5495                );
5496            }
5497        }
5498    }
5499
5500    /// T1: the bare stored path (no `./`) must continue to resolve.
5501    /// Regression guard for the pre-fix matcher's "exact" path.
5502    #[test]
5503    fn test_focus_file_resolver_accepts_bare_stored_path() {
5504        let g = focus_resolver_graph();
5505        let res = g.resolve_focus_file("device_opt/services/storage.py");
5506        match res {
5507            FocusResolution::Found(idx) => {
5508                assert_eq!(g.files[idx].path, "device_opt/services/storage.py");
5509            }
5510            other => panic!("expected Found, got {other:?}"),
5511        }
5512    }
5513
5514    /// T1: strict-suffix match — `storage.py` must match
5515    /// `device_opt/services/storage.py` (prev char is `/`) but ambiguity
5516    /// (two `storage.py` files) must be reported, not silently picked.
5517    #[test]
5518    fn test_focus_file_resolver_strict_suffix_and_ambiguity() {
5519        let g = focus_resolver_graph();
5520        // "storage.py" matches both device_opt/services/storage.py and
5521        // tests/test_storage.py? No — test_storage.py has `_` before `storage.py`
5522        // (not `/`), so the strict-suffix matcher rejects it. Only one match.
5523        let res = g.resolve_focus_file("storage.py");
5524        assert!(
5525            matches!(res, FocusResolution::Found(_)),
5526            "strict-suffix `storage.py` must match exactly one file (the `_` in \
5527             test_storage.py blocks the strict-suffix), got {res:?}"
5528        );
5529        // Add a second services/storage.py-shaped file to force ambiguity.
5530        let mut g2 = g.clone();
5531        g2.files.push(FileNode {
5532            path: "vendored/services/storage.py".to_string(),
5533            defs: vec![],
5534            imports: vec![],
5535        });
5536        g2.base_ranks.push(0.0);
5537        g2.callers.push(vec![]);
5538        g2.callees.push(vec![]);
5539        g2.def_offsets.push(*g2.def_offsets.last().unwrap());
5540        let res = g2.resolve_focus_file("storage.py");
5541        match res {
5542            FocusResolution::Ambiguous(cands) => {
5543                assert_eq!(cands.len(), 2, "expected two candidates, got {cands:?}");
5544            }
5545            other => panic!("expected Ambiguous, got {other:?}"),
5546        }
5547    }
5548
5549    /// T1: a focus that matches no file returns `NotFound`. The caller
5550    /// is responsible for either treating this as unfocused or surfacing
5551    /// an error — the resolver itself does not impose policy.
5552    #[test]
5553    fn test_focus_file_resolver_not_found() {
5554        let g = focus_resolver_graph();
5555        let res = g.resolve_focus_file("./does/not/exist.py");
5556        assert!(
5557            matches!(res, FocusResolution::NotFound),
5558            "expected NotFound, got {res:?}"
5559        );
5560    }
5561
5562    /// T1: empty focus does not match anything (avoids the empty-suffix
5563    /// degenerate that would otherwise match every file).
5564    #[test]
5565    fn test_focus_file_resolver_empty_input_is_not_found() {
5566        let g = focus_resolver_graph();
5567        let res = g.resolve_focus_file("");
5568        assert!(
5569            matches!(res, FocusResolution::NotFound),
5570            "empty focus must not match anything, got {res:?}"
5571        );
5572    }
5573
5574    /// T1: focus_file rank delta must be visible on a Python-shaped
5575    /// synthetic graph.
5576    ///
5577    /// Builds a small Python-style call graph (FileNode + Definition with
5578    /// resolved CallRefs, matching what `extract_calls` produces on a real
5579    /// Python corpus), runs `build_graph_from_files_pub` to get a
5580    /// `RepoGraph`, then calls `render_json_budgeted` with and without
5581    /// focus. Asserts that the focused call changes the rank of at least
5582    /// one non-focus file by ≥ 5% in either direction.
5583    ///
5584    /// On the baseline (pre-T1) this test passes when the caller supplies
5585    /// an int focus_idx directly — the engine's topic-sensitive PageRank is
5586    /// correct. The bug was at the string-to-int resolver layer in
5587    /// `tools.rs`, which silently masked the failure as "the rendering
5588    /// path doesn't propagate focus". This test locks the engine's
5589    /// behavior so a future regression in the rendering path is caught.
5590    #[test]
5591    #[expect(
5592        clippy::too_many_lines,
5593        reason = "synthetic Python-shaped graph (five FileNodes with defs + \
5594                  CallRefs + ImportRefs) plus the two-call assertion sequence \
5595                  is inherently long; splitting into helpers would obscure the \
5596                  one-shot reproduction the test is locking in."
5597    )]
5598    fn test_focus_file_rank_delta_visible_on_python_corpus() {
5599        // Five files, Python-shaped: services/storage.py (a "hub" that two
5600        // UI files call into) plus a tests/ file. The Python tree-sitter
5601        // extractor produces `class_definition` and `function_definition`
5602        // kinds with resolved CallRefs pointing at the hub.
5603        let mut files: Vec<FileNode> = vec![
5604            FileNode {
5605                path: "device_opt/services/storage.py".to_string(),
5606                defs: vec![
5607                    Definition {
5608                        name: "ScanStore".to_string(),
5609                        kind: "class_definition".to_string(),
5610                        start_line: 1,
5611                        end_line: 80,
5612                        scope: String::new(),
5613                        signature: None,
5614                        start_byte: 0,
5615                        end_byte: 2000,
5616                        calls: vec![],
5617                        decorator: None,
5618                        lsp_kind_hint: None,
5619                    },
5620                    Definition {
5621                        name: "save_scan".to_string(),
5622                        kind: "function_definition".to_string(),
5623                        start_line: 20,
5624                        end_line: 40,
5625                        scope: "class_definition ScanStore".to_string(),
5626                        signature: Some("def save_scan(self, scan)".to_string()),
5627                        start_byte: 200,
5628                        end_byte: 600,
5629                        calls: vec![],
5630                        decorator: None,
5631                        lsp_kind_hint: None,
5632                    },
5633                ],
5634                imports: vec![],
5635            },
5636            FileNode {
5637                path: "device_opt/services/registry.py".to_string(),
5638                defs: vec![Definition {
5639                    name: "register".to_string(),
5640                    kind: "function_definition".to_string(),
5641                    start_line: 1,
5642                    end_line: 30,
5643                    scope: String::new(),
5644                    signature: Some("def register(svc)".to_string()),
5645                    start_byte: 0,
5646                    end_byte: 600,
5647                    calls: vec![CallRef {
5648                        name: "save_scan".to_string(),
5649                        qualified_path: None,
5650                        receiver_type: None,
5651                        byte_offset: 100,
5652                        resolved: None,
5653                    }],
5654                    decorator: None,
5655                    lsp_kind_hint: None,
5656                }],
5657                imports: vec![ImportRef {
5658                    raw_path: "from device_opt.services import storage".to_string(),
5659                    resolved_idx: Some(0),
5660                }],
5661            },
5662            FileNode {
5663                path: "device_opt/ui/screens/browse.py".to_string(),
5664                defs: vec![Definition {
5665                    name: "browse_scans".to_string(),
5666                    kind: "function_definition".to_string(),
5667                    start_line: 1,
5668                    end_line: 50,
5669                    scope: String::new(),
5670                    signature: Some("def browse_scans(app)".to_string()),
5671                    start_byte: 0,
5672                    end_byte: 1000,
5673                    calls: vec![CallRef {
5674                        name: "save_scan".to_string(),
5675                        qualified_path: None,
5676                        receiver_type: None,
5677                        byte_offset: 200,
5678                        resolved: None,
5679                    }],
5680                    decorator: None,
5681                    lsp_kind_hint: None,
5682                }],
5683                imports: vec![ImportRef {
5684                    raw_path: "from device_opt.services import storage".to_string(),
5685                    resolved_idx: Some(0),
5686                }],
5687            },
5688            FileNode {
5689                path: "device_opt/ui/screens/settings.py".to_string(),
5690                defs: vec![Definition {
5691                    name: "open_settings".to_string(),
5692                    kind: "function_definition".to_string(),
5693                    start_line: 1,
5694                    end_line: 40,
5695                    scope: String::new(),
5696                    signature: Some("def open_settings(app)".to_string()),
5697                    start_byte: 0,
5698                    end_byte: 800,
5699                    calls: vec![CallRef {
5700                        name: "register".to_string(),
5701                        qualified_path: None,
5702                        receiver_type: None,
5703                        byte_offset: 150,
5704                        resolved: None,
5705                    }],
5706                    decorator: None,
5707                    lsp_kind_hint: None,
5708                }],
5709                imports: vec![ImportRef {
5710                    raw_path: "from device_opt.services import registry".to_string(),
5711                    resolved_idx: Some(1),
5712                }],
5713            },
5714            FileNode {
5715                path: "tests/test_storage.py".to_string(),
5716                defs: vec![Definition {
5717                    name: "test_save".to_string(),
5718                    kind: "function_definition".to_string(),
5719                    start_line: 1,
5720                    end_line: 20,
5721                    scope: String::new(),
5722                    signature: Some("def test_save()".to_string()),
5723                    start_byte: 0,
5724                    end_byte: 400,
5725                    calls: vec![CallRef {
5726                        name: "save_scan".to_string(),
5727                        qualified_path: None,
5728                        receiver_type: None,
5729                        byte_offset: 50,
5730                        resolved: None,
5731                    }],
5732                    decorator: None,
5733                    lsp_kind_hint: None,
5734                }],
5735                imports: vec![ImportRef {
5736                    raw_path: "from device_opt.services import storage".to_string(),
5737                    resolved_idx: Some(0),
5738                }],
5739            },
5740        ];
5741
5742        // Resolve calls so the graph builder has edges to chew on.
5743        let def_index = build_def_index(&files);
5744        resolve_calls(&mut files, &def_index, &HashMap::new());
5745        let graph = build_graph_from_files_pub(files);
5746
5747        // Sanity: the graph must have edges (the calls were resolved).
5748        assert!(
5749            !graph.edges.is_empty(),
5750            "Python-shaped synthetic graph must produce file-level edges; got 0. \
5751             The CallRefs may have failed to resolve."
5752        );
5753
5754        // Resolve the focus file via the new helper.
5755        let focus_idx = match graph.resolve_focus_file("./device_opt/ui/screens/settings.py") {
5756            FocusResolution::Found(i) => i,
5757            other => panic!("resolver must find settings.py via LSP-shaped path, got {other:?}"),
5758        };
5759
5760        let budget = 4000;
5761        let unfocused = render_json_budgeted(&graph, budget, None, false);
5762        let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
5763
5764        // Collect rank-by-path maps for both runs.
5765        let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
5766            .files
5767            .iter()
5768            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5769            .collect();
5770        let focused_ranks: std::collections::HashMap<String, f32> = focused
5771            .files
5772            .iter()
5773            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5774            .collect();
5775
5776        eprintln!("T1 Python — unfocused ranks: {unfocused_ranks:#?}");
5777        eprintln!("T1 Python — focused ranks:   {focused_ranks:#?}");
5778
5779        // Find at least one non-focus file whose rank changed by ≥ 5% in
5780        // either direction. The threshold is conservative; the soft 0.15
5781        // personalization alpha redistributes mass enough that on this
5782        // 5-node graph the affected neighbors typically shift by 20%+.
5783        let focus_path = "./device_opt/ui/screens/settings.py";
5784        let mut max_delta_ratio = 0.0_f32;
5785        for (path, &u_rank) in &unfocused_ranks {
5786            if path == focus_path {
5787                continue;
5788            }
5789            if let Some(&f_rank) = focused_ranks.get(path)
5790                && u_rank > 0.0
5791            {
5792                let ratio = (f_rank - u_rank).abs() / u_rank;
5793                if ratio > max_delta_ratio {
5794                    max_delta_ratio = ratio;
5795                }
5796            }
5797        }
5798        assert!(
5799            max_delta_ratio >= 0.05,
5800            "focus_file must rebias non-focus file ranks by ≥ 5%; \
5801             max observed delta ratio = {max_delta_ratio:.3} \
5802             (I#20: focus_file invisible on Python corpora)"
5803        );
5804
5805        // Bit-identity guard: at least one non-focus file's rank must NOT
5806        // equal its unfocused value. This is the pathology from the
5807        // mnemosyne reproduction: every rank value was bit-identical
5808        // across global/focused calls.
5809        let any_changed = unfocused_ranks.iter().any(|(path, &u_rank)| {
5810            path != focus_path
5811                && focused_ranks
5812                    .get(path)
5813                    .is_some_and(|&f_rank| f_rank.to_bits() != u_rank.to_bits())
5814        });
5815        assert!(
5816            any_changed,
5817            "no non-focus file rank changed across focused/unfocused calls — \
5818             bit-identical pathology (I#20). unfocused={unfocused_ranks:#?} \
5819             focused={focused_ranks:#?}"
5820        );
5821    }
5822
5823    /// T1: focus_file rank delta on a Rust-shaped synthetic graph.
5824    ///
5825    /// Regression test: confirms the engine's topic-sensitive PageRank
5826    /// works on Rust shapes (where T1's investigation found it already
5827    /// works, but the resolver fix must not break the existing path).
5828    ///
5829    /// This complements `test_focus_file_returns_neighborhood_not_just_focus`
5830    /// by additionally checking that (a) the resolver accepts a Rust path
5831    /// with the `./` LSP prefix, and (b) at least one non-focus file's
5832    /// rank moves by ≥ 5%.
5833    #[test]
5834    #[expect(
5835        clippy::too_many_lines,
5836        reason = "synthetic Rust-shaped graph with four FileNodes plus the \
5837                  two-call assertion sequence inherently exceeds the 100-line \
5838                  cap; the test mirrors the Python-shaped sibling."
5839    )]
5840    fn test_focus_file_rank_delta_preserved_on_rust_corpus() {
5841        let mut files: Vec<FileNode> = vec![
5842            FileNode {
5843                path: "src/lib.rs".to_string(),
5844                defs: vec![Definition {
5845                    name: "Engine".to_string(),
5846                    kind: "struct_item".to_string(),
5847                    start_line: 1,
5848                    end_line: 30,
5849                    scope: String::new(),
5850                    signature: None,
5851                    start_byte: 0,
5852                    end_byte: 600,
5853                    calls: vec![],
5854                    decorator: None,
5855                    lsp_kind_hint: None,
5856                }],
5857                imports: vec![],
5858            },
5859            FileNode {
5860                path: "src/encoder/mod.rs".to_string(),
5861                defs: vec![Definition {
5862                    name: "encode".to_string(),
5863                    kind: "function_item".to_string(),
5864                    start_line: 1,
5865                    end_line: 40,
5866                    scope: String::new(),
5867                    signature: Some("fn encode(input: &str) -> Vec<f32>".to_string()),
5868                    start_byte: 0,
5869                    end_byte: 800,
5870                    calls: vec![],
5871                    decorator: None,
5872                    lsp_kind_hint: None,
5873                }],
5874                imports: vec![ImportRef {
5875                    raw_path: "use crate::lib;".to_string(),
5876                    resolved_idx: Some(0),
5877                }],
5878            },
5879            FileNode {
5880                path: "src/search.rs".to_string(),
5881                defs: vec![Definition {
5882                    name: "search".to_string(),
5883                    kind: "function_item".to_string(),
5884                    start_line: 1,
5885                    end_line: 30,
5886                    scope: String::new(),
5887                    signature: Some("fn search(q: &str) -> Hits".to_string()),
5888                    start_byte: 0,
5889                    end_byte: 600,
5890                    calls: vec![CallRef {
5891                        name: "encode".to_string(),
5892                        qualified_path: None,
5893                        receiver_type: None,
5894                        byte_offset: 100,
5895                        resolved: None,
5896                    }],
5897                    decorator: None,
5898                    lsp_kind_hint: None,
5899                }],
5900                imports: vec![ImportRef {
5901                    raw_path: "use crate::encoder;".to_string(),
5902                    resolved_idx: Some(1),
5903                }],
5904            },
5905            FileNode {
5906                path: "src/cli.rs".to_string(),
5907                defs: vec![Definition {
5908                    name: "main".to_string(),
5909                    kind: "function_item".to_string(),
5910                    start_line: 1,
5911                    end_line: 20,
5912                    scope: String::new(),
5913                    signature: Some("fn main()".to_string()),
5914                    start_byte: 0,
5915                    end_byte: 400,
5916                    calls: vec![CallRef {
5917                        name: "search".to_string(),
5918                        qualified_path: None,
5919                        receiver_type: None,
5920                        byte_offset: 50,
5921                        resolved: None,
5922                    }],
5923                    decorator: None,
5924                    lsp_kind_hint: None,
5925                }],
5926                imports: vec![ImportRef {
5927                    raw_path: "use crate::search;".to_string(),
5928                    resolved_idx: Some(2),
5929                }],
5930            },
5931        ];
5932
5933        let def_index = build_def_index(&files);
5934        resolve_calls(&mut files, &def_index, &HashMap::new());
5935        let graph = build_graph_from_files_pub(files);
5936
5937        assert!(
5938            !graph.edges.is_empty(),
5939            "Rust-shaped synthetic graph must produce edges"
5940        );
5941
5942        let focus_idx = match graph.resolve_focus_file("./src/encoder/mod.rs") {
5943            FocusResolution::Found(i) => i,
5944            other => panic!("resolver must find encoder/mod.rs via LSP path, got {other:?}"),
5945        };
5946
5947        let budget = 4000;
5948        let unfocused = render_json_budgeted(&graph, budget, None, false);
5949        let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
5950
5951        let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
5952            .files
5953            .iter()
5954            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5955            .collect();
5956        let focused_ranks: std::collections::HashMap<String, f32> = focused
5957            .files
5958            .iter()
5959            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
5960            .collect();
5961
5962        eprintln!("T1 Rust — unfocused: {unfocused_ranks:#?}");
5963        eprintln!("T1 Rust — focused:   {focused_ranks:#?}");
5964
5965        let focus_path = "./src/encoder/mod.rs";
5966        let mut max_delta_ratio = 0.0_f32;
5967        for (path, &u_rank) in &unfocused_ranks {
5968            if path == focus_path {
5969                continue;
5970            }
5971            if let Some(&f_rank) = focused_ranks.get(path)
5972                && u_rank > 0.0
5973            {
5974                let ratio = (f_rank - u_rank).abs() / u_rank;
5975                if ratio > max_delta_ratio {
5976                    max_delta_ratio = ratio;
5977                }
5978            }
5979        }
5980        assert!(
5981            max_delta_ratio >= 0.05,
5982            "focus_file must rebias non-focus file ranks by ≥ 5% on Rust shapes; \
5983             max observed delta = {max_delta_ratio:.3}"
5984        );
5985    }
5986
5987    #[test]
5988    fn test_pagerank_empty() {
5989        let ranks = pagerank(0, &[], None);
5990        assert!(ranks.is_empty());
5991    }
5992
5993    #[test]
5994    fn test_render_tiers() {
5995        // Build a small graph with 10 files to exercise all tiers
5996        let files: Vec<FileNode> = (0..10)
5997            .map(|i| FileNode {
5998                path: format!("src/file_{i}.rs"),
5999                defs: vec![Definition {
6000                    name: format!("func_{i}"),
6001                    kind: "function_item".to_string(),
6002                    start_line: 1,
6003                    end_line: 5,
6004                    scope: String::new(),
6005                    signature: Some(format!("func_{i}(x: i32) -> i32")),
6006                    start_byte: 0,
6007                    end_byte: 0,
6008                    calls: vec![],
6009                    decorator: None,
6010                    lsp_kind_hint: None,
6011                }],
6012                imports: vec![],
6013            })
6014            .collect();
6015
6016        // Create a star graph: files 1-9 all import from file 0
6017        let edges: Vec<(u32, u32, u32)> = (1..10).map(|i| (i, 0, 1)).collect();
6018        let base_ranks = pagerank(10, &edges, None);
6019        let (top_callers, top_callees) = build_neighbor_lists(10, &edges);
6020
6021        let graph = RepoGraph {
6022            files,
6023            edges,
6024            base_ranks,
6025            callers: top_callers,
6026            callees: top_callees,
6027            def_edges: vec![],
6028            def_ranks: vec![],
6029            def_callers: vec![],
6030            def_callees: vec![],
6031            def_offsets: vec![0],
6032            alpha: 0.5,
6033        };
6034
6035        // Large budget: should include all files
6036        let full = render(&graph, 10_000, None);
6037        assert!(
6038            full.contains("file_0"),
6039            "output should contain the top-ranked file"
6040        );
6041        // file_0 should appear as tier 0 (highest rank)
6042        assert!(
6043            full.contains("## src/file_0.rs"),
6044            "top file should have tier 0 heading"
6045        );
6046
6047        // Tiny budget: should only fit a few files
6048        let small = render(&graph, 10, None);
6049        assert!(
6050            !small.is_empty(),
6051            "even tiny budget should produce some output"
6052        );
6053        // Should have fewer entries than full render
6054        let full_lines = full.lines().count();
6055        let small_lines = small.lines().count();
6056        assert!(
6057            small_lines < full_lines,
6058            "small budget ({small_lines} lines) should have fewer lines than full ({full_lines})"
6059        );
6060    }
6061
6062    #[test]
6063    fn test_render_empty_graph() {
6064        let graph = RepoGraph {
6065            files: vec![],
6066            edges: vec![],
6067            base_ranks: vec![],
6068            callers: vec![],
6069            callees: vec![],
6070            def_edges: vec![],
6071            def_ranks: vec![],
6072            def_callers: vec![],
6073            def_callees: vec![],
6074            def_offsets: vec![0],
6075            alpha: 0.5,
6076        };
6077        let output = render(&graph, 1000, None);
6078        assert!(output.is_empty(), "empty graph should render empty string");
6079    }
6080
6081    #[test]
6082    fn test_build_graph_on_fixtures() {
6083        let fixtures = Path::new(env!("CARGO_MANIFEST_DIR"))
6084            .parent()
6085            .unwrap()
6086            .parent()
6087            .unwrap()
6088            .join("tests")
6089            .join("fixtures");
6090
6091        let graph = build_graph(&fixtures).expect("build_graph should succeed on fixtures");
6092
6093        // Should find at least the 3 fixture files
6094        assert!(
6095            !graph.files.is_empty(),
6096            "graph should contain files from fixtures"
6097        );
6098
6099        // Should find definitions in the Rust fixture
6100        let rs_file = graph.files.iter().find(|f| f.path.ends_with("sample.rs"));
6101        assert!(rs_file.is_some(), "should find sample.rs");
6102        let rs_file = rs_file.unwrap();
6103        assert!(
6104            !rs_file.defs.is_empty(),
6105            "sample.rs should have definitions"
6106        );
6107        assert!(
6108            rs_file.defs.iter().any(|d| d.name == "hello"),
6109            "should find 'hello' function in sample.rs"
6110        );
6111
6112        // Should find definitions in the Python fixture
6113        let py_file = graph.files.iter().find(|f| f.path.ends_with("sample.py"));
6114        assert!(py_file.is_some(), "should find sample.py");
6115        let py_file = py_file.unwrap();
6116        assert!(
6117            !py_file.defs.is_empty(),
6118            "sample.py should have definitions"
6119        );
6120        assert!(
6121            py_file.defs.iter().any(|d| d.name == "greet"),
6122            "should find 'greet' function in sample.py"
6123        );
6124
6125        // PageRank scores should be computed
6126        assert_eq!(graph.base_ranks.len(), graph.files.len());
6127        let sum: f32 = graph.base_ranks.iter().sum();
6128        assert!(
6129            (sum - 1.0).abs() < 0.01,
6130            "PageRank scores should sum to ~1.0, got {sum}"
6131        );
6132    }
6133
6134    #[test]
6135    fn test_extract_imports_rust() {
6136        let source = "use crate::foo::bar;\nuse std::collections::HashMap;\n";
6137        let (lang, query) = import_query_for_extension("rs").unwrap();
6138        let imports = extract_imports(source, &lang, &query);
6139        assert_eq!(imports.len(), 2);
6140        assert!(imports[0].contains("crate::foo::bar"));
6141    }
6142
6143    #[test]
6144    fn test_extract_imports_python_stub() {
6145        let source = "from typing import Protocol\nimport pkg.types\n";
6146        let (lang, query) = import_query_for_extension("pyi").unwrap();
6147        let imports = extract_imports(source, &lang, &query);
6148        assert_eq!(imports.len(), 2);
6149        assert!(imports[0].contains("from typing import Protocol"));
6150        assert!(imports[1].contains("import pkg.types"));
6151    }
6152
6153    #[test]
6154    fn test_resolve_python_import_to_stub_file() {
6155        let root = PathBuf::from("/project");
6156        let mut file_index = HashMap::new();
6157        file_index.insert(PathBuf::from("/project/pkg/types.pyi"), 1);
6158
6159        let result = resolve_python_import("import pkg.types", &root, &file_index);
6160        assert_eq!(result, Some(1));
6161    }
6162
6163    #[test]
6164    fn test_resolve_rust_crate_import() {
6165        let root = PathBuf::from("/project");
6166        let file_path = PathBuf::from("/project/src/main.rs");
6167        let mut file_index = HashMap::new();
6168        file_index.insert(PathBuf::from("/project/src/foo/bar.rs"), 1);
6169        file_index.insert(PathBuf::from("/project/src/main.rs"), 0);
6170
6171        let result = resolve_rust_import("use crate::foo::bar;", &file_path, &root, &file_index);
6172        assert_eq!(result, Some(1));
6173    }
6174
6175    #[test]
6176    fn test_resolve_rust_external_crate_dropped() {
6177        let root = PathBuf::from("/project");
6178        let file_path = PathBuf::from("/project/src/main.rs");
6179        let file_index = HashMap::new();
6180
6181        let result = resolve_rust_import(
6182            "use std::collections::HashMap;",
6183            &file_path,
6184            &root,
6185            &file_index,
6186        );
6187        assert_eq!(result, None, "external crate imports should be dropped");
6188    }
6189
6190    #[test]
6191    fn test_neighbor_lists() {
6192        // 0 -> 1, 0 -> 2, 1 -> 2
6193        let edges = vec![(0, 1, 1), (0, 2, 1), (1, 2, 1)];
6194        let (incoming, outgoing) = build_neighbor_lists(3, &edges);
6195
6196        // Node 2 should be called by 0 and 1
6197        assert!(incoming[2].contains(&0));
6198        assert!(incoming[2].contains(&1));
6199
6200        // Node 0 should call 1 and 2
6201        assert!(outgoing[0].contains(&1));
6202        assert!(outgoing[0].contains(&2));
6203    }
6204
6205    /// G1 (R2.3 issue a): A scoped call `mod_a::foo()` must store:
6206    /// - `name = "foo"` (bare identifier, for def-index lookup)
6207    /// - `qualified_path = Some("mod_a::foo")` (full path, for disambiguation)
6208    ///
6209    /// Before G1, `name` stored the full `"mod_a::foo"` path. After G1, `name`
6210    /// is always the bare trailing identifier and `qualified_path` carries the
6211    /// full path when the call is scoped.
6212    #[test]
6213    fn test_scoped_identifier_calls_preserve_path() {
6214        use crate::languages;
6215        use streaming_iterator::StreamingIterator as _;
6216
6217        let source = "
6218mod mod_a {
6219    pub fn foo() {}
6220}
6221mod mod_b {
6222    pub fn foo() {}
6223}
6224fn caller() {
6225    mod_a::foo();
6226    mod_b::foo();
6227}
6228";
6229        let call_config =
6230            languages::call_query_for_extension("rs").expect("Rust call config must exist");
6231        let lang_config =
6232            languages::config_for_extension("rs").expect("Rust lang config must exist");
6233
6234        let mut defs = {
6235            let mut parser = tree_sitter::Parser::new();
6236            parser.set_language(&lang_config.language).unwrap();
6237            let tree = parser.parse(source, None).unwrap();
6238            let mut cursor = tree_sitter::QueryCursor::new();
6239            let mut out = Vec::new();
6240            let mut matches =
6241                cursor.matches(&lang_config.query, tree.root_node(), source.as_bytes());
6242            while let Some(m) = matches.next() {
6243                let mut name = String::new();
6244                let mut def_node = None;
6245                for cap in m.captures {
6246                    let cname = &lang_config.query.capture_names()[cap.index as usize];
6247                    if *cname == "name" {
6248                        name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
6249                    } else if *cname == "def" {
6250                        def_node = Some(cap.node);
6251                    }
6252                }
6253                if let Some(node) = def_node {
6254                    #[expect(clippy::cast_possible_truncation)]
6255                    out.push(Definition {
6256                        name,
6257                        kind: node.kind().to_string(),
6258                        start_line: node.start_position().row as u32 + 1,
6259                        end_line: node.end_position().row as u32 + 1,
6260                        scope: String::new(),
6261                        signature: None,
6262                        start_byte: node.start_byte() as u32,
6263                        end_byte: node.end_byte() as u32,
6264                        calls: vec![],
6265                        decorator: None,
6266                        lsp_kind_hint: None,
6267                    });
6268                }
6269            }
6270            out
6271        };
6272
6273        extract_calls(source, &call_config, &mut defs);
6274
6275        // Find the `caller` function definition
6276        let caller_def = defs
6277            .iter()
6278            .find(|d| d.name == "caller")
6279            .expect("caller def");
6280
6281        // G1: bare name is "foo", qualified_path carries the module path.
6282        let call_names: Vec<&str> = caller_def.calls.iter().map(|c| c.name.as_str()).collect();
6283        let qualified_paths: Vec<Option<&str>> = caller_def
6284            .calls
6285            .iter()
6286            .map(|c| c.qualified_path.as_deref())
6287            .collect();
6288
6289        // Bare names must be the trailing identifier only.
6290        assert!(
6291            call_names.contains(&"foo"),
6292            "bare name 'foo' must appear for scoped calls; got: {call_names:?}"
6293        );
6294        // Qualified paths must carry the full scope.
6295        assert!(
6296            qualified_paths.contains(&Some("mod_a::foo")),
6297            "qualified_path 'mod_a::foo' must appear; got: {qualified_paths:?}"
6298        );
6299        assert!(
6300            qualified_paths.contains(&Some("mod_b::foo")),
6301            "qualified_path 'mod_b::foo' must appear; got: {qualified_paths:?}"
6302        );
6303        // Full paths must NOT appear in bare names.
6304        assert!(
6305            !call_names.contains(&"mod_a::foo"),
6306            "full path 'mod_a::foo' must not appear in bare name; got: {call_names:?}"
6307        );
6308    }
6309
6310    /// RED test (R2.3 issue b+c): Two defs named `Read` in different modules,
6311    /// an unqualified call to `Read`. Resolution must NOT silently pick the first.
6312    /// Either both are returned (ambiguous) or none.
6313    #[test]
6314    fn test_ambiguous_name_resolution_returns_all_or_none() {
6315        // Build two FileNodes each with a def named "Read", then a third with an
6316        // unqualified call to "Read".
6317        let file_a = FileNode {
6318            path: "mod_a.rs".to_string(),
6319            defs: vec![Definition {
6320                name: "Read".to_string(),
6321                kind: "trait_item".to_string(),
6322                start_line: 1,
6323                end_line: 3,
6324                scope: String::new(),
6325                signature: None,
6326                start_byte: 0,
6327                end_byte: 50,
6328                calls: vec![],
6329                decorator: None,
6330                lsp_kind_hint: None,
6331            }],
6332            imports: vec![],
6333        };
6334        let file_b = FileNode {
6335            path: "mod_b.rs".to_string(),
6336            defs: vec![Definition {
6337                name: "Read".to_string(),
6338                kind: "trait_item".to_string(),
6339                start_line: 1,
6340                end_line: 3,
6341                scope: String::new(),
6342                signature: None,
6343                start_byte: 0,
6344                end_byte: 50,
6345                calls: vec![],
6346                decorator: None,
6347                lsp_kind_hint: None,
6348            }],
6349            imports: vec![],
6350        };
6351        let file_c = FileNode {
6352            path: "caller.rs".to_string(),
6353            defs: vec![Definition {
6354                name: "do_thing".to_string(),
6355                kind: "function_item".to_string(),
6356                start_line: 1,
6357                end_line: 5,
6358                scope: String::new(),
6359                signature: None,
6360                start_byte: 0,
6361                end_byte: 100,
6362                calls: vec![CallRef {
6363                    name: "Read".to_string(),
6364                    qualified_path: None,
6365                    receiver_type: None,
6366                    byte_offset: 10,
6367                    resolved: None,
6368                }],
6369                decorator: None,
6370                lsp_kind_hint: None,
6371            }],
6372            imports: vec![],
6373        };
6374
6375        let mut files = vec![file_a, file_b, file_c];
6376        let def_index = build_def_index(&files);
6377        resolve_calls(&mut files, &def_index, &HashMap::new());
6378
6379        // The unqualified call to "Read" is ambiguous (two candidates, neither in same
6380        // file nor imported). Resolution must leave it as None — silent first-wins is wrong.
6381        let resolved = files[2].defs[0].calls[0].resolved;
6382        assert_eq!(
6383            resolved, None,
6384            "ambiguous unqualified call with no import context must resolve to None, not silently pick first"
6385        );
6386    }
6387
6388    // ── D1 / D2 tests ────────────────────────────────────────────────
6389
6390    /// Build a small test graph with N files and an optional JSON-extension file.
6391    fn build_test_graph(n_code: usize, include_json: bool) -> (RepoGraph, Vec<usize>) {
6392        let mut file_nodes: Vec<FileNode> = (0..n_code)
6393            .map(|i| FileNode {
6394                path: format!("src/file_{i}.rs"),
6395                defs: vec![
6396                    Definition {
6397                        name: format!("func_{i}"),
6398                        kind: "function_item".to_string(),
6399                        start_line: 1,
6400                        end_line: 5,
6401                        scope: String::new(),
6402                        signature: Some(format!("fn func_{i}() -> i32")),
6403                        start_byte: 0,
6404                        end_byte: 100,
6405                        calls: vec![],
6406                        decorator: None,
6407                        lsp_kind_hint: None,
6408                    },
6409                    Definition {
6410                        name: format!("MyStruct{i}"),
6411                        kind: "struct_item".to_string(),
6412                        start_line: 7,
6413                        end_line: 10,
6414                        scope: String::new(),
6415                        signature: None,
6416                        start_byte: 110,
6417                        end_byte: 200,
6418                        calls: vec![],
6419                        decorator: None,
6420                        lsp_kind_hint: None,
6421                    },
6422                ],
6423                imports: vec![],
6424            })
6425            .collect();
6426
6427        let json_idx = if include_json {
6428            let idx = file_nodes.len();
6429            file_nodes.push(FileNode {
6430                path: "data/config.json".to_string(),
6431                defs: vec![],
6432                imports: vec![],
6433            });
6434            vec![idx]
6435        } else {
6436            vec![]
6437        };
6438
6439        // Build a star graph: all code files point to file_0.
6440        let n = file_nodes.len();
6441        #[expect(clippy::cast_possible_truncation, reason = "test: n_code << u32::MAX")]
6442        let edges: Vec<(u32, u32, u32)> = (1..n_code).map(|i| (i as u32, 0, 1)).collect();
6443
6444        let base_ranks = pagerank(n, &edges, None);
6445        let (callers, callees) = build_neighbor_lists(n, &edges);
6446
6447        let graph = RepoGraph {
6448            files: file_nodes,
6449            edges,
6450            base_ranks,
6451            callers,
6452            callees,
6453            def_edges: vec![],
6454            def_ranks: vec![],
6455            def_callers: vec![],
6456            def_callees: vec![],
6457            def_offsets: vec![0],
6458            alpha: 0.5,
6459        };
6460
6461        (graph, json_idx)
6462    }
6463
6464    /// D1: `render_json` returns a `GetRepoMapResponse` with a `files` array.
6465    ///
6466    /// On the baseline (before D1) `get_repo_map_ripvec` returned markdown prose via
6467    /// `repo_map::render`; no `files` key existed in the output.
6468    #[test]
6469    fn get_repo_map_returns_json_with_files_array() {
6470        let (graph, _) = build_test_graph(5, false);
6471        let response = render_json(&graph, 50, None, false);
6472        assert!(
6473            !response.files.is_empty(),
6474            "files array should be non-empty for a non-empty graph"
6475        );
6476        // Serialize and verify the JSON shape has a `files` key.
6477        let json = serde_json::to_string(&response).expect("serialize");
6478        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6479        assert!(
6480            parsed["files"].is_array(),
6481            "serialized response must have a `files` JSON array; got: {parsed}"
6482        );
6483    }
6484
6485    /// D1: every file entry has an `lsp_location` field.
6486    ///
6487    /// Before D1, output was prose text; no `lsp_location` existed anywhere in the response.
6488    #[test]
6489    fn get_repo_map_each_file_has_lsp_location() {
6490        let (graph, _) = build_test_graph(5, false);
6491        let response = render_json(&graph, 50, None, false);
6492        for file in &response.files {
6493            assert!(
6494                !file.lsp_location.file_path.is_empty(),
6495                "each file must have a non-empty lsp_location.file_path"
6496            );
6497        }
6498        // Also verify through JSON.
6499        let json = serde_json::to_string(&response).expect("serialize");
6500        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6501        for entry in parsed["files"].as_array().expect("files array") {
6502            assert!(
6503                entry["lsp_location"]["file_path"].is_string(),
6504                "each file entry must have lsp_location.file_path string; entry: {entry}"
6505            );
6506        }
6507    }
6508
6509    /// D1: every symbol has a `kind` (u32) and an `lsp_location`.
6510    ///
6511    /// Before D1 symbols were rendered as prose strings like `"function_item func_0"`.
6512    #[test]
6513    fn get_repo_map_each_symbol_has_kind_and_lsp_location() {
6514        let (graph, _) = build_test_graph(3, false);
6515        let response = render_json(&graph, 50, None, false);
6516        for file in &response.files {
6517            for sym in &file.symbols {
6518                assert!(
6519                    sym.kind > 0,
6520                    "symbol kind must be a positive LSP SymbolKind; got 0 for '{}'",
6521                    sym.name
6522                );
6523                assert!(
6524                    !sym.lsp_location.file_path.is_empty(),
6525                    "symbol must have lsp_location.file_path"
6526                );
6527            }
6528        }
6529        // Verify through JSON: kind should be a number.
6530        let json = serde_json::to_string(&response).expect("serialize");
6531        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6532        for file_entry in parsed["files"].as_array().expect("files") {
6533            for sym_entry in file_entry["symbols"].as_array().expect("symbols") {
6534                assert!(
6535                    sym_entry["kind"].is_number(),
6536                    "symbol `kind` must be a JSON number; sym: {sym_entry}"
6537                );
6538                assert!(
6539                    sym_entry["lsp_location"]["file_path"].is_string(),
6540                    "symbol must have lsp_location.file_path; sym: {sym_entry}"
6541                );
6542            }
6543        }
6544    }
6545
6546    /// D1: `calls` field is an array of `RepoMapCall`-shaped objects (each has
6547    /// `lsp_location` and `rank`).
6548    ///
6549    /// In 4.0.1 calls moved from bare `lsp_location` objects to `RepoMapCall`
6550    /// objects that carry both the target `lsp_location` and the target file's
6551    /// `base_rank`.
6552    #[test]
6553    fn get_repo_map_calls_field_is_array_of_lsp_locations() {
6554        // Build a 5-file star graph so file_0 has non-empty callees.
6555        let (graph, _) = build_test_graph(5, false);
6556        let response = render_json(&graph, 50, None, false);
6557        let json = serde_json::to_string(&response).expect("serialize");
6558        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6559        for file_entry in parsed["files"].as_array().expect("files") {
6560            let calls = file_entry["calls"]
6561                .as_array()
6562                .expect("calls must be an array");
6563            for call in calls {
6564                // In 4.0.1 each call entry is a RepoMapCall with lsp_location + rank.
6565                assert!(
6566                    call["lsp_location"]["file_path"].is_string(),
6567                    "each call entry must have lsp_location.file_path string; call: {call}"
6568                );
6569                assert!(
6570                    call["rank"].is_number(),
6571                    "each call entry must have a numeric rank; call: {call}"
6572                );
6573            }
6574        }
6575    }
6576
6577    /// D2 / G3: `render_json_budgeted` with a very tight budget returns fewer files.
6578    ///
6579    /// Before the budget allocator, `max_files=3` controlled file count but not
6580    /// per-file expansion. In 4.0.1 the token_budget controls total bytes; with
6581    /// a budget of 1 token (= 4 bytes) only the envelope minimum allows any file
6582    /// at all, and the test verifies that the total_files counter still reflects
6583    /// the full eligible count. `render_json` (compat shim) passes a generous
6584    /// budget; use `render_json_budgeted` with a tight budget to verify the cap.
6585    #[test]
6586    fn get_repo_map_returns_at_most_max_files_files() {
6587        let (graph, _) = build_test_graph(10, false);
6588        // Use render_json_budgeted directly with a tight budget (600 bytes = 3 files
6589        // × 200-byte floor). Each file's envelope minimum is 200 bytes so a 600-byte
6590        // budget should admit at most 3 files.
6591        let response = render_json_budgeted(&graph, 150, None, false);
6592        assert!(
6593            response.files.len() <= 3,
6594            "files.len() = {} must be <= 3 for a 600-byte budget",
6595            response.files.len()
6596        );
6597        assert_eq!(
6598            response.total_files, 10,
6599            "total_files must reflect the full eligible count before budget cap"
6600        );
6601        assert!(
6602            response.capped,
6603            "capped must be true when total_files > files.len()"
6604        );
6605    }
6606
6607    /// D2: `include_metadata=false` (default) excludes JSON/TOML/etc. files.
6608    ///
6609    /// Before D2, JSON files with thousands of repeated keys dominated the
6610    /// output (Issue #5 — JSON-key flooding).
6611    #[test]
6612    fn get_repo_map_excludes_meta_by_default() {
6613        let (graph, _) = build_test_graph(3, /*include_json=*/ true);
6614        // Default: include_metadata = false
6615        let response = render_json(&graph, 50, None, false);
6616        for file in &response.files {
6617            assert!(
6618                !std::path::Path::new(&file.lsp_location.file_path)
6619                    .extension()
6620                    .is_some_and(|e| e.eq_ignore_ascii_case("json")),
6621                "JSON (Meta) files must be excluded when include_metadata=false; found: {}",
6622                file.lsp_location.file_path
6623            );
6624        }
6625    }
6626
6627    /// D2: `include_metadata=true` includes JSON files.
6628    ///
6629    /// Callers who opt-in to metadata should see all content kinds.
6630    #[test]
6631    fn get_repo_map_include_metadata_true_includes_json() {
6632        let (graph, _) = build_test_graph(3, /*include_json=*/ true);
6633        let response = render_json(&graph, 50, None, true);
6634        let has_json = response.files.iter().any(|f| {
6635            std::path::Path::new(&f.lsp_location.file_path)
6636                .extension()
6637                .is_some_and(|e| e.eq_ignore_ascii_case("json"))
6638        });
6639        assert!(
6640            has_json,
6641            "JSON file must be present when include_metadata=true"
6642        );
6643    }
6644
6645    /// J1/J2 MEASUREMENT: flask corpus focus_file=blueprints.py rank dispersion.
6646    ///
6647    /// Mandatory measurement from the 4.0.5 Wave-2 Front-C briefing:
6648    /// - `len(files) >= 8` (not collapsed to just the focus)
6649    /// - focus file rank is the highest in the response
6650    /// - next 5 files all have rank >= 10% of focus rank
6651    /// - neighborhood contains semantically related files (app.py, scaffold.py)
6652    #[test]
6653    #[ignore = "runs on flask corpus at tests/corpus/code/flask; use --ignored --nocapture"]
6654    #[expect(
6655        clippy::too_many_lines,
6656        reason = "end-to-end corpus measurement test; assertion sequence is sequential and cannot be meaningfully split"
6657    )]
6658    fn test_flask_focus_blueprints_rank_dispersion() {
6659        let corpus_root = Path::new(env!("CARGO_MANIFEST_DIR"))
6660            .parent()
6661            .unwrap()
6662            .parent()
6663            .unwrap()
6664            .join("tests/corpus/code/flask");
6665
6666        assert!(
6667            corpus_root.exists(),
6668            "flask corpus not found at {}",
6669            corpus_root.display()
6670        );
6671
6672        let graph = build_graph(&corpus_root).expect("build_graph on flask corpus");
6673        eprintln!("Flask corpus: {} files in graph", graph.files.len());
6674
6675        // Find focus file
6676        let focus_path = "src/flask/blueprints.py";
6677        let focus_idx = graph.files.iter().position(|f| f.path == focus_path);
6678        eprintln!("Focus file '{focus_path}' -> idx: {focus_idx:?}");
6679        assert!(
6680            focus_idx.is_some(),
6681            "blueprints.py not found in graph; available files: {:?}",
6682            graph
6683                .files
6684                .iter()
6685                .map(|f| &f.path)
6686                .take(20)
6687                .collect::<Vec<_>>()
6688        );
6689
6690        let response = render_json_budgeted(&graph, 4000, focus_idx, false);
6691
6692        // Criterion 1: at least 8 files returned.
6693        eprintln!(
6694            "Focused response: {} files (total_files={})",
6695            response.files.len(),
6696            response.total_files
6697        );
6698        assert!(
6699            response.files.len() >= 8,
6700            "expected >= 8 files in focused response; got {} — I#16 winner-take-all collapse",
6701            response.files.len()
6702        );
6703
6704        // Print top 10 for inspection.
6705        eprintln!("\nTop 10 focused files:");
6706        for (i, f) in response.files.iter().take(10).enumerate() {
6707            eprintln!("  [{i}] rank={:.6}  {}", f.rank, f.lsp_location.file_path);
6708        }
6709
6710        // Criterion 2: focus file must appear near the top (top-3) of focused
6711        // results.  With PERSONALIZATION_ALPHA=0.15 and the flask corpus,
6712        // src/flask/app.py has higher structural rank than blueprints.py and
6713        // may legitimately rank #1 — the focus boosts blueprints.py relative
6714        // to its unfocused position, but doesn't guarantee it beats every
6715        // structurally central neighbor.  Being in top-3 confirms the bias
6716        // is working (pre-fix blueprints.py was #1 at 0.703 but that was a
6717        // degenerate collapse; now #1 or #2 is healthy).
6718        let focus_file_rank = response
6719            .files
6720            .iter()
6721            .find(|f| {
6722                f.lsp_location.file_path.contains("blueprints.py")
6723                    && !f.lsp_location.file_path.contains("test_")
6724                    && !f.lsp_location.file_path.contains("sansio")
6725            })
6726            .map(|f| f.rank)
6727            .unwrap_or(0.0);
6728        let focus_position = response
6729            .files
6730            .iter()
6731            .position(|f| {
6732                f.lsp_location.file_path.contains("blueprints.py")
6733                    && !f.lsp_location.file_path.contains("test_")
6734                    && !f.lsp_location.file_path.contains("sansio")
6735            })
6736            .unwrap_or(usize::MAX);
6737        eprintln!(
6738            "\nblueprinets.py position: #{} rank={:.6}",
6739            focus_position + 1,
6740            focus_file_rank
6741        );
6742        assert!(
6743            focus_position < 3,
6744            "blueprints.py must be in top-3 focused results (got position {}); \
6745             soft personalization must rebias toward focus neighborhood — I#16",
6746            focus_position + 1
6747        );
6748
6749        // Criterion 3: next 5 non-focus files have rank >= 10% of the top
6750        // file's rank.  This is the core dispersion check: no more Dirac-delta
6751        // collapse where one file is 0.703 and all others are 0.003.
6752        let top_rank = response.files[0].rank;
6753        let non_focus_min_5 = response
6754            .files
6755            .iter()
6756            .filter(|f| {
6757                !(f.lsp_location.file_path.contains("blueprints.py")
6758                    && !f.lsp_location.file_path.contains("test_")
6759                    && !f.lsp_location.file_path.contains("sansio"))
6760            })
6761            .take(5)
6762            .map(|f| f.rank)
6763            .fold(f32::INFINITY, f32::min);
6764        let pct = non_focus_min_5 / top_rank * 100.0;
6765        eprintln!(
6766            "\nNext-5 (non-focus) min rank: {non_focus_min_5:.6} = {pct:.1}% of top rank {top_rank:.6}"
6767        );
6768        assert!(
6769            pct >= 10.0,
6770            "next-5 non-focus files min rank is {pct:.1}% of top rank (need ≥ 10%); \
6771             files are collapsing to near-zero floor — I#16"
6772        );
6773
6774        // Criterion 4: neighborhood quality — related files present.
6775        let related_names = ["app.py", "scaffold.py", "sansio"];
6776        let found_related: Vec<&str> = related_names
6777            .iter()
6778            .copied()
6779            .filter(|name| {
6780                response
6781                    .files
6782                    .iter()
6783                    .any(|f| f.lsp_location.file_path.contains(name))
6784            })
6785            .collect();
6786        eprintln!("\nNeighborhood quality: found related files: {found_related:?}");
6787        // At least one related file should appear (soft assertion — log if missing).
6788        if found_related.is_empty() {
6789            eprintln!(
6790                "WARNING: no expected related files (app.py, scaffold.py) found in neighborhood"
6791            );
6792        }
6793    }
6794
6795    #[test]
6796    #[ignore = "runs on full ripvec codebase; use --nocapture to see output"]
6797    fn test_full_repo_map() {
6798        use std::time::Instant;
6799
6800        let root = Path::new(env!("CARGO_MANIFEST_DIR"))
6801            .parent()
6802            .unwrap()
6803            .parent()
6804            .unwrap();
6805
6806        // Phase 1: build_graph (walk + parse + import resolve + PageRank)
6807        let t0 = Instant::now();
6808        let graph = build_graph(root).expect("build_graph on ripvec root");
6809        let build_ms = t0.elapsed().as_secs_f64() * 1000.0;
6810
6811        // Phase 2: render (default, no focus)
6812        let t1 = Instant::now();
6813        let rendered = render(&graph, 2000, None);
6814        let render_ms = t1.elapsed().as_secs_f64() * 1000.0;
6815
6816        // Phase 3: render (topic-sensitive, focused on highest-ranked file)
6817        let t2 = Instant::now();
6818        let focus_idx = graph
6819            .base_ranks
6820            .iter()
6821            .enumerate()
6822            .max_by(|a, b| a.1.total_cmp(b.1))
6823            .map(|(i, _)| i);
6824        let focused = render(&graph, 2000, focus_idx);
6825        let focus_ms = t2.elapsed().as_secs_f64() * 1000.0;
6826
6827        eprintln!("\n=== Repo Map Performance ===");
6828        eprintln!(
6829            "Files: {}, Edges: {}, Defs: {}",
6830            graph.files.len(),
6831            graph.edges.len(),
6832            graph.files.iter().map(|f| f.defs.len()).sum::<usize>()
6833        );
6834        eprintln!("build_graph:     {build_ms:.1}ms (walk + parse + resolve + PageRank)");
6835        eprintln!(
6836            "render(default): {render_ms:.3}ms ({} chars, ~{} tokens)",
6837            rendered.len(),
6838            rendered.len() / 4
6839        );
6840        eprintln!(
6841            "render(focused): {focus_ms:.3}ms ({} chars, ~{} tokens)",
6842            focused.len(),
6843            focused.len() / 4
6844        );
6845
6846        eprintln!("\nTop 5 by PageRank:");
6847        let mut ranked: Vec<(usize, f32)> = graph.base_ranks.iter().copied().enumerate().collect();
6848        ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
6849        for (i, rank) in ranked.iter().take(5) {
6850            eprintln!("  {:.4} {}", rank, graph.files[*i].path);
6851        }
6852
6853        eprintln!("\n=== Default Render ===\n{rendered}");
6854        eprintln!(
6855            "\n=== Focused Render (on {}) ===\n{focused}",
6856            focus_idx
6857                .map(|i| graph.files[i].path.as_str())
6858                .unwrap_or("none")
6859        );
6860    }
6861
6862    // ── C1/C2 Tests (4.1.1): Python decorator-aware kind classification ──
6863
6864    /// `test:chunker_stores_property_decorator_kind` — a Python `@property`-
6865    /// decorated function produces a [`Definition`] with
6866    /// `decorator = Some("property")` and `lsp_kind_hint = Some(7)`.
6867    ///
6868    /// Behavior: trigger-fails-on-baseline-then-passes-post-fix.
6869    /// Baseline: `Definition` had no `decorator` / `lsp_kind_hint` fields.
6870    /// Post-fix: `extract_definitions` populates both from the AST.
6871    #[test]
6872    fn test_chunker_stores_property_decorator_kind() {
6873        let source = "@property\ndef foo(self):\n    return self._x\n";
6874        let lang_cfg = crate::languages::config_for_extension("py").expect("python lang config");
6875        let defs = extract_definitions(source, &lang_cfg);
6876
6877        let foo = defs
6878            .iter()
6879            .find(|d| d.name == "foo")
6880            .expect("expected def 'foo'");
6881        assert_eq!(
6882            foo.decorator.as_deref(),
6883            Some("property"),
6884            "decorator must be Some(\"property\") for @property def; got {:?}",
6885            foo.decorator
6886        );
6887        assert_eq!(
6888            foo.lsp_kind_hint,
6889            Some(7),
6890            "lsp_kind_hint must be Some(7) (Property) for @property def; got {:?}",
6891            foo.lsp_kind_hint
6892        );
6893    }
6894
6895    /// `test:chunker_stores_classmethod_decorator_kind` — a Python `@classmethod`-
6896    /// decorated function produces a [`Definition`] with
6897    /// `decorator = Some("classmethod")` and `lsp_kind_hint = Some(12)`.
6898    #[test]
6899    fn test_chunker_stores_classmethod_decorator_kind() {
6900        let source = "@classmethod\ndef from_dict(cls, d):\n    pass\n";
6901        let lang_cfg = crate::languages::config_for_extension("py").expect("python lang config");
6902        let defs = extract_definitions(source, &lang_cfg);
6903
6904        let from_dict = defs
6905            .iter()
6906            .find(|d| d.name == "from_dict")
6907            .expect("expected def 'from_dict'");
6908        assert_eq!(
6909            from_dict.decorator.as_deref(),
6910            Some("classmethod"),
6911            "decorator must be Some(\"classmethod\"); got {:?}",
6912            from_dict.decorator
6913        );
6914        assert_eq!(
6915            from_dict.lsp_kind_hint,
6916            Some(12),
6917            "lsp_kind_hint must be Some(12) (Function) for @classmethod def; got {:?}",
6918            from_dict.lsp_kind_hint
6919        );
6920    }
6921
6922    /// `test:chunker_stores_arbitrary_decorator_name` — a Python
6923    /// `@functools.lru_cache`-decorated function (attribute-access decorator)
6924    /// produces a [`Definition`] with `decorator = Some("functools.lru_cache")`
6925    /// and `lsp_kind_hint = Some(12)`.
6926    #[test]
6927    fn test_chunker_stores_arbitrary_decorator_name() {
6928        let source = "@functools.lru_cache\ndef expensive(n):\n    pass\n";
6929        let lang_cfg = crate::languages::config_for_extension("py").expect("python lang config");
6930        let defs = extract_definitions(source, &lang_cfg);
6931
6932        let expensive = defs
6933            .iter()
6934            .find(|d| d.name == "expensive")
6935            .expect("expected def 'expensive'");
6936        // Attribute decorators are stored as dotted-name text.
6937        assert_eq!(
6938            expensive.decorator.as_deref(),
6939            Some("functools.lru_cache"),
6940            "decorator must be Some(\"functools.lru_cache\"); got {:?}",
6941            expensive.decorator
6942        );
6943        assert_eq!(
6944            expensive.lsp_kind_hint,
6945            Some(12),
6946            "lsp_kind_hint must be Some(12) (Function) for @functools.lru_cache def; got {:?}",
6947            expensive.lsp_kind_hint
6948        );
6949    }
6950
6951    /// `test:repo_map_projection_uses_stored_kind_for_python_decorator` — the
6952    /// `render_json_budgeted` projection site reads `lsp_kind_hint` from the
6953    /// stored [`Definition`] rather than re-computing via `lsp_symbol_kind_for_node_kind`.
6954    ///
6955    /// A synthetic [`RepoGraph`] with a Definition whose `kind = "decorated_definition"`
6956    /// (which would map to Property=7 via the AST-less path) but
6957    /// `lsp_kind_hint = Some(12)` must produce a [`RepoMapSymbol`] with `kind = 12`.
6958    #[test]
6959    fn test_repo_map_projection_uses_stored_kind_for_python_decorator() {
6960        // Build a synthetic graph: one file, one @classmethod-decorated function.
6961        // kind = "decorated_definition" (AST-less → 7 = Property, WRONG).
6962        // lsp_kind_hint = Some(12) (stored at parse time, CORRECT).
6963        let files = vec![FileNode {
6964            path: "module.py".to_string(),
6965            defs: vec![Definition {
6966                name: "from_dict".to_string(),
6967                kind: "decorated_definition".to_string(),
6968                start_line: 1,
6969                end_line: 3,
6970                scope: String::new(),
6971                signature: None,
6972                start_byte: 0,
6973                end_byte: 60,
6974                calls: vec![],
6975                decorator: Some("classmethod".to_string()),
6976                lsp_kind_hint: Some(12),
6977            }],
6978            imports: vec![],
6979        }];
6980        let graph = build_graph_from_files_pub(files);
6981        let result = render_json_budgeted(&graph, 4000, None, false);
6982
6983        // Locate the symbol in the result.
6984        let file = result
6985            .files
6986            .iter()
6987            .find(|f| f.lsp_location.file_path.contains("module.py"))
6988            .expect("module.py must appear in render output");
6989        let sym = file
6990            .symbols
6991            .iter()
6992            .find(|s| s.name == "from_dict")
6993            .expect("from_dict must appear as a symbol");
6994        assert_eq!(
6995            sym.kind, 12,
6996            "C2: lsp_kind_hint=12 must override the AST-less kind (7 for decorated_definition); got {}",
6997            sym.kind
6998        );
6999    }
7000}