ripvec_core/
repo_map.rs

1//! `PageRank`-weighted structural overview of a codebase.
2//!
3//! Builds a dependency graph from tree-sitter definition and import extraction,
4//! ranks files by importance using `PageRank` (standard or topic-sensitive), and
5//! renders a budget-constrained overview with tiered detail levels.
6
7use std::collections::{HashMap, HashSet};
8use std::fmt::Write as _;
9use std::path::{Path, PathBuf};
10
11use rayon::prelude::*;
12use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
13use streaming_iterator::StreamingIterator;
14use tree_sitter::{Parser, Query, QueryCursor};
15
16use serde::{Deserialize, Serialize};
17
18use crate::chunk::ContentKind;
19use crate::languages;
20use crate::walk;
21
22/// Serialize a `ContentKind` to a lowercase string tag for JSON output.
23fn content_kind_tag(ck: ContentKind) -> &'static str {
24    match ck {
25        ContentKind::Code => "code",
26        ContentKind::Docs => "docs",
27        ContentKind::Meta => "meta",
28    }
29}
30
31// ── Data Structures ──────────────────────────────────────────────────
32
/// Persisted dependency graph with `PageRank` scores.
///
/// Holds two views of the same call graph: a file-level view (`edges`,
/// `base_ranks`, `callers`, `callees`) and a definition-level view
/// (`def_edges`, `def_ranks`, `def_callers`, `def_callees`). The
/// definition-level vectors are flattened across all files; `def_offsets`
/// provides the per-file starting position used to turn a [`DefId`] into a
/// linear index.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct RepoGraph {
    /// Files in the repository with definitions, imports, and calls.
    pub files: Vec<FileNode>,
    /// File-level edges (derived from def-level call edges).
    ///
    /// NOTE(review): the tuple layout is presumably
    /// `(source_file, target_file, weight)`, mirroring `def_edges` — confirm
    /// against the graph builder.
    pub edges: Vec<(u32, u32, u32)>,
    /// File-level `PageRank` scores (aggregated from def-level).
    pub base_ranks: Vec<f32>,
    /// File-level callers (indices into `files`).
    pub callers: Vec<Vec<u32>>,
    /// File-level callees (indices into `files`).
    pub callees: Vec<Vec<u32>>,
    /// Definition-level call edges: `(caller_def, callee_def, weight)`.
    pub def_edges: Vec<(DefId, DefId, u32)>,
    /// Definition-level `PageRank` scores (flattened: `offsets[file_idx] + def_idx`).
    pub def_ranks: Vec<f32>,
    /// Definition-level callers (flattened, parallel to `def_ranks`).
    pub def_callers: Vec<Vec<DefId>>,
    /// Definition-level callees (flattened, parallel to `def_ranks`).
    pub def_callees: Vec<Vec<DefId>>,
    /// Prefix-sum offsets for flattening `DefId` to linear index.
    ///
    /// The linear index of `(file_idx, def_idx)` is
    /// `def_offsets[file_idx] + def_idx`.
    pub def_offsets: Vec<usize>,
    /// Auto-tuned alpha for search boost.
    pub alpha: f32,
}
59
/// A file in the repository with its definitions and imports.
///
/// One `FileNode` is stored per entry of [`RepoGraph::files`]; file indices
/// used elsewhere in the graph refer to positions in that vector.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct FileNode {
    /// Relative path from the repository root.
    pub path: String,
    /// Definitions (functions, structs, classes, etc.) extracted from this file.
    ///
    /// The position of a definition in this vector is the second component
    /// of its [`DefId`].
    pub defs: Vec<Definition>,
    /// Import references extracted from this file.
    pub imports: Vec<ImportRef>,
}
70
/// A definition extracted from a source file.
///
/// Built by `extract_definitions` from a tree-sitter `@def` capture. Line
/// numbers are 1-based (tree-sitter row + 1); byte offsets are taken from
/// the captured node unchanged.
#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct Definition {
    /// Name of the definition (e.g., function name, class name).
    ///
    /// Empty when the query match carried no `@name` capture.
    pub name: String,
    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
    pub kind: String,
    /// 1-based start line number.
    pub start_line: u32,
    /// 1-based end line number.
    pub end_line: u32,
    /// Scope chain (e.g., `"impl_item Foo > fn bar"`).
    pub scope: String,
    /// Function/method signature, if available.
    pub signature: Option<String>,
    /// Byte offset of this definition's start in the source file.
    pub start_byte: u32,
    /// Byte offset of this definition's end in the source file.
    pub end_byte: u32,
    /// Call sites within this definition's body.
    ///
    /// Left empty by `extract_definitions`; filled in by a later pass.
    pub calls: Vec<CallRef>,
    /// The first decorator name for Python `decorated_definition` nodes (e.g.,
    /// `"property"`, `"classmethod"`, `"staticmethod"`, `"cached_property"`).
    ///
    /// `None` for all non-Python definitions and for bare (undecorated) Python
    /// functions and classes. Populated by `extract_definitions` at AST-parse
    /// time with full tree-sitter access (C1, 4.1.1).
    pub decorator: Option<String>,
    /// Decorator-aware LSP SymbolKind integer (e.g., 7=Property, 12=Function).
    ///
    /// Computed at parse time when the AST is available so projection sites
    /// (`render_json_budgeted`) do not need to re-parse. For Python `@property`
    /// or `@cached_property` → 7. For `@classmethod`, `@staticmethod`, or any
    /// other decorator → 12. `None` for all non-decorated definitions; callers
    /// fall back to `lsp_symbol_kind_for_node_kind(&self.kind)` when `None`
    /// (C1/C2, 4.1.1).
    pub lsp_kind_hint: Option<u32>,
}
109
/// An import reference extracted from a source file.
///
/// Keeps the import text as written alongside the outcome of resolving it
/// to a file in the graph.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct ImportRef {
    /// Raw import path as written in source (e.g., `crate::foo::bar`).
    pub raw_path: String,
    /// Resolved file index in [`RepoGraph::files`], if resolution succeeded.
    ///
    /// `None` when the import could not be mapped to a file in the graph.
    pub resolved_idx: Option<u32>,
}
118
/// Unique identifier for a definition: (file index, definition index within file).
///
/// The definition index is a `u16`, so at most 65 536 definitions per file
/// can be addressed.
pub type DefId = (u32, u16);
121
/// A call site extracted from a definition body.
///
/// Carries the callee name as written, optional disambiguation hints
/// (`qualified_path`, `receiver_type`), and the resolution result.
#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct CallRef {
    /// Callee function/method name (bare, without qualifier).
    ///
    /// For scoped calls like `mod_a::foo()`, this is `"foo"`.
    /// For bare calls like `foo()`, this is `"foo"`.
    pub name: String,
    /// Full qualified path for scoped calls, e.g. `Some("mod_a::foo")`.
    ///
    /// `None` for bare (unqualified) calls. When `Some`, `resolve_calls`
    /// uses this for qualifier-based module disambiguation before falling
    /// back to the bare `name`.
    pub qualified_path: Option<String>,
    /// Receiver type for method calls, inferred from local context.
    ///
    /// Set to `Some("Foo")` when:
    /// - The call is `self.method()` inside `impl Foo { … }`.
    /// - The call is `x.method()` where `x` has an explicit type annotation `x: Foo`.
    /// - The call is `x.method()` after `let x = Foo::new()`.
    ///
    /// `None` for free function calls, or when the receiver type cannot be
    /// inferred from local context alone. When `Some`, `resolve_calls` prefers
    /// defs whose enclosing impl scope matches the receiver type.
    pub receiver_type: Option<String>,
    /// Byte offset of the call in the source file (for scoping to definitions).
    pub byte_offset: u32,
    /// Resolved target definition, if resolution succeeded.
    ///
    /// `None` until resolution runs, and afterwards when no target was found.
    pub resolved: Option<DefId>,
}
152
153// ── JSON output types ────────────────────────────────────────────────
154
/// LSP-shaped location pointing at a file or symbol within a file.
///
/// Lines and characters are 0-based, matching the Language Server Protocol
/// convention so callers can pass this directly to LSP tools without any
/// conversion. Note that [`Definition`] stores 1-based lines, so a
/// conversion is needed when building a location from a definition.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapLspLocation {
    /// Relative path from the repository root (prefixed with `./`).
    pub file_path: String,
    /// 0-based start line.
    pub start_line: usize,
    /// 0-based start character (0 for file-level locations).
    pub start_character: usize,
    /// 0-based end line (equals `start_line` for file-level locations).
    pub end_line: usize,
    /// 0-based end character (0 for file-level locations).
    pub end_character: usize,
}
173
/// A top-level symbol extracted from a file in the repository map.
///
/// Analogous to an LSP `DocumentSymbol` but limited to the fields available
/// from tree-sitter definition extraction. The `rank` field carries the
/// definition-level `PageRank` score from [`RepoGraph::def_ranks`], enabling
/// callers to prioritise symbols by structural importance.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapSymbol {
    /// Symbol name (function name, struct name, etc.).
    pub name: String,
    /// LSP `SymbolKind` as a decimal — use the same values as
    /// `lsp_workspace_symbols` and `lsp_document_symbols`
    /// (for example 7 = Property, 12 = Function).
    pub kind: u32,
    /// Location pointing at the symbol's definition line (0-based).
    pub lsp_location: RepoMapLspLocation,
    /// Definition-level `PageRank` score from [`RepoGraph::def_ranks`].
    ///
    /// Higher values indicate definitions that are called by many other
    /// definitions. Used by the token-budget allocator to decide which
    /// symbols to include when the per-file budget is constrained.
    pub rank: f32,
}
196
/// An outgoing call-edge from a file to another file.
///
/// Carries both the target file's `lsp_location` and its `base_rank`
/// (file-level `PageRank` score) so callers can decide how important
/// each dependency is without a separate lookup.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapCall {
    /// Location pointing at the target file (line 0, character 0).
    pub lsp_location: RepoMapLspLocation,
    /// File-level `PageRank` score of the target file
    /// (the target's entry in [`RepoGraph::base_ranks`]).
    pub rank: f32,
}
209
/// One file entry in the JSON repo map.
///
/// Carries the file's `PageRank` score, content kind, outgoing call-edges to
/// other files, and the file's top-level symbol definitions — all with
/// `lsp_location` so the caller can chain directly into LSP tools without
/// any destructuring. The two `truncated_*` counters report how much was
/// left out of `symbols` and `calls` respectively.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapFile {
    /// Location pointing at the file itself (line 0, character 0).
    ///
    /// Pass `lsp_location.file_path` directly into `lsp_document_symbols` or
    /// any other file-scoped tool.
    pub lsp_location: RepoMapLspLocation,
    /// `PageRank` score in [0, 1] (higher = more structurally central).
    pub rank: f32,
    /// Content classification: `"code"`, `"docs"`, or `"meta"`.
    ///
    /// Serialized as a lowercase string tag so JSON consumers can branch
    /// without numeric magic values. Mirrors the `ContentKind` enum in
    /// `ripvec-core::chunk`.
    pub content_kind: &'static str,
    /// Outgoing call-edges sorted by target file `PageRank` descending.
    pub calls: Vec<RepoMapCall>,
    /// Top-level definitions extracted from this file by tree-sitter,
    /// sorted by definition-level `PageRank` descending and pruned to
    /// the per-file token-budget allocation.
    pub symbols: Vec<RepoMapSymbol>,
    /// Number of symbols that were omitted due to budget exhaustion or
    /// logarithmic attenuation cutoff. `truncated_symbols + symbols.len()`
    /// equals the total definition count for the file.
    pub truncated_symbols: usize,
    /// Number of call-edges that were omitted due to the `MAX_FILE_CALLS`
    /// render cap or byte-budget exhaustion. `truncated_calls + calls.len()`
    /// equals the total callee count for the file (I#68, 4.1.4).
    pub truncated_calls: usize,
}
246
/// JSON-mode response envelope for `get_repo_map` (4.0.1 shape).
///
/// Replaces the `max_files`-capped shape from 4.0.0. The caller supplies a
/// `token_budget`; files are allocated bytes proportional to their `PageRank`
/// (40% cap per file, 200-byte envelope floor). Symbols are filled in
/// def-rank order with a logarithmic attenuation cutoff. Leftover bytes
/// cascade to subsequent files.
///
/// The `estimated_bytes`, `budget_bytes`, and `budget_exhausted` fields give
/// callers real-time feedback on how tightly the budget was consumed.
#[derive(Debug, Clone, Serialize)]
pub struct GetRepoMapResponse {
    /// Files sorted by `PageRank` descending, pruned to the token budget.
    pub files: Vec<RepoMapFile>,
    /// Total number of eligible files in the graph (pre-allocation).
    ///
    /// If `total_files > files.len()`, the budget ran out before all files
    /// could be included. Read `budget_exhausted` directly for the boolean.
    pub total_files: usize,
    /// Actual serialised-JSON byte count for all returned content.
    pub estimated_bytes: usize,
    /// Budget ceiling in bytes that was used for allocation
    /// (`token_budget * 4`, i.e. four bytes per token).
    pub budget_bytes: usize,
    /// `true` when `total_files > files.len()` (budget was exhausted before
    /// all eligible files were included).
    pub budget_exhausted: bool,
    /// Retained for backward compatibility with 4.0.0 callers that checked
    /// `capped`. Equivalent to `budget_exhausted`; new callers should read
    /// `budget_exhausted` instead.
    pub capped: bool,
}
278
279// ── Constants ────────────────────────────────────────────────────────
280
/// `PageRank` damping factor.
const DAMPING: f32 = 0.85;

/// `PageRank` convergence threshold.
const EPSILON: f32 = 1e-6;

/// Maximum `PageRank` iterations.
const MAX_ITERATIONS: usize = 100;

/// Maximum callers/callees stored per file (display-oriented neighbor lists).
///
/// Raised from 5 → 25 in 4.1.3 (I#60): hub functions in real Rust/Python/Go
/// corpora commonly have 10-25 outgoing call edges; the old cap of 5 caused
/// every large-scale corpus file to report `truncated_calls ≥ 2` and left
/// agents reading `get_repo_map.files[i].calls[]` with a sparse skeleton.
///
/// The serialisation-size cost is bounded by the existing `token_budget`
/// allocator, which gates each file's call list against its byte allocation.
/// BFS-internal reachability (`compute_dead_code`) uses the untruncated
/// `def_edges` CSR directly (I#57/I#61) and is NOT affected by this constant.
const MAX_NEIGHBORS: usize = 25;

/// Maximum number of outgoing call entries rendered per file in the JSON response.
///
/// Applied at render time in [`render_json_budgeted`] as a hard cap on
/// `calls[]` length (I#68, 4.1.4). Symmetric with the graph-build cap
/// [`MAX_NEIGHBORS`] so agents always see up to 25 callees — matching the
/// neighbor-list cap raised in I#60. The two constants are declared
/// independently; keep them in sync when changing either.
///
/// Replaces the logarithmic attenuation cutoff that was previously applied to
/// file-level callees. Attenuation is appropriate for *symbol* lists (where
/// rank distributions are informative) but pathological for *call-edge* lists:
/// in real corpora callee base-ranks follow a geometric distribution, causing
/// the attenuation to fire at pos=1 and collapse `calls[]` to a single entry.
/// The byte-budget check is retained; this constant adds a count ceiling.
const MAX_FILE_CALLS: usize = 25;

/// Approximate characters per token for budget estimation.
///
/// Matches the `token_budget * 4` conversion documented on
/// `GetRepoMapResponse::budget_bytes`.
const CHARS_PER_TOKEN: usize = 4;

/// Concentration mass placed on the focus node in topic-sensitive `PageRank`.
///
/// Following Haveliwala 2002 ("Topic-Sensitive PageRank"), the personalization
/// vector places a bias `α` on the focus node and distributes the remaining
/// `1 - α` uniformly over all other nodes. This preserves rank dispersion
/// across the corpus — the user sees a *neighborhood* of related files
/// rebiased toward the focus, not a Dirac delta on the focus node with
/// every other file collapsed to an equally negligible uniform floor.
///
/// Value 0.35 means:
///   - focus node teleportation probability = 0.35
///   - each of the (n - 1) other nodes = 0.65 / (n - 1)
///
/// Calibration history:
/// - Pre-4.0.5: α = 0.70 → winner-take-all collapse (flask focus = 0.703,
///   all others ≈ 0.003); fixed under I#16.
/// - 4.0.5 → 4.1.11: α = 0.15 → preserved dispersion but underbiased on
///   real corpora: flask focus blueprints.py landed at position #5-#7
///   instead of top-3 because structural hubs (helpers.py, app.py)
///   dominated.
/// - 4.1.12+: α = 0.35 → focus reliably surfaces in top-3 on flask
///   (rank ~0.10 vs hub rank ~0.10), small-graph dispersion tests still
///   pass (n=10 star dispersion ratio remains under the 40× ceiling),
///   J2 file-count parity relaxed from 80% to 70% as the unavoidable
///   trade-off for stronger focus bias.
const PERSONALIZATION_ALPHA: f32 = 0.35;
347
348// ── Import Queries ───────────────────────────────────────────────────
349
350/// Compile a tree-sitter import query for the given extension.
351///
352/// Returns `None` for unsupported extensions.
353fn import_query_for_extension(ext: &str) -> Option<(tree_sitter::Language, Query)> {
354    let (lang, query_str): (tree_sitter::Language, &str) = match ext {
355        "rs" => (
356            tree_sitter_rust::LANGUAGE.into(),
357            "(use_declaration) @import",
358        ),
359        "py" | "pyi" => (
360            tree_sitter_python::LANGUAGE.into(),
361            concat!(
362                "(import_statement) @import\n",
363                "(import_from_statement) @import",
364            ),
365        ),
366        "js" | "jsx" => (
367            tree_sitter_javascript::LANGUAGE.into(),
368            "(import_statement source: (string) @import_path) @import",
369        ),
370        "ts" => (
371            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
372            "(import_statement source: (string) @import_path) @import",
373        ),
374        "tsx" => (
375            tree_sitter_typescript::LANGUAGE_TSX.into(),
376            "(import_statement source: (string) @import_path) @import",
377        ),
378        "go" => (
379            tree_sitter_go::LANGUAGE.into(),
380            "(import_spec path: (interpreted_string_literal) @import_path) @import",
381        ),
382        // Ruby: require statements.
383        "rb" => (
384            tree_sitter_ruby::LANGUAGE.into(),
385            "(call method: (identifier) @_method arguments: (argument_list (string (string_content) @import_path)) (#eq? @_method \"require\")) @import",
386        ),
387        _ => return None,
388    };
389    let query = match Query::new(&lang, query_str) {
390        Ok(q) => q,
391        Err(e) => {
392            tracing::warn!(ext, %e, "import query compilation failed — language may be ABI-incompatible");
393            return None;
394        }
395    };
396    Some((lang, query))
397}
398
399/// Extract import paths from source using tree-sitter.
400fn extract_imports(
401    source: &str,
402    lang: &tree_sitter::Language,
403    import_query: &Query,
404) -> Vec<String> {
405    let mut parser = Parser::new();
406    if parser.set_language(lang).is_err() {
407        return vec![];
408    }
409    let Some(tree) = parser.parse(source, None) else {
410        return vec![];
411    };
412
413    let mut cursor = QueryCursor::new();
414    let mut imports = Vec::new();
415    let mut matches = cursor.matches(import_query, tree.root_node(), source.as_bytes());
416
417    while let Some(m) = matches.next() {
418        // Prefer @import_path capture (JS/TS/Go), fall back to full @import text
419        let mut import_path_text = None;
420        let mut import_text = None;
421
422        for cap in m.captures {
423            let cap_name = &import_query.capture_names()[cap.index as usize];
424            let text = &source[cap.node.start_byte()..cap.node.end_byte()];
425            if *cap_name == "import_path" {
426                import_path_text = Some(text.trim_matches(|c| c == '"' || c == '\''));
427            } else if *cap_name == "import" {
428                import_text = Some(text);
429            }
430        }
431
432        if let Some(path) = import_path_text {
433            imports.push(path.to_string());
434        } else if let Some(text) = import_text {
435            imports.push(text.to_string());
436        }
437    }
438
439    imports
440}
441
442// ── Import Resolution ────────────────────────────────────────────────
443
/// Strip an optional visibility modifier (`pub`, `pub(crate)`, `pub(in …)`)
/// and the `use` keyword from the front of a `use` declaration.
///
/// Text that does not start with those tokens is returned unchanged (apart
/// from leading whitespace), so a bare path such as `crate::foo` is accepted.
fn strip_rust_use_prefix(decl: &str) -> &str {
    let mut rest = decl.trim_start();
    if let Some(after_pub) = rest.strip_prefix("pub") {
        if let Some(restricted) = after_pub.trim_start().strip_prefix('(') {
            // `pub(crate)`, `pub(super)`, `pub(in some::path)`
            if let Some((_, tail)) = restricted.split_once(')') {
                rest = tail.trim_start();
            }
        } else if after_pub.starts_with(char::is_whitespace) {
            rest = after_pub.trim_start();
        }
    }
    match rest.strip_prefix("use") {
        Some(after_use) if after_use.starts_with(char::is_whitespace) => after_use.trim_start(),
        _ => rest,
    }
}

/// Directory that holds the child modules of the module defined by `file_path`.
///
/// `mod.rs`, `lib.rs`, and `main.rs` keep their children next to themselves;
/// any other `foo.rs` keeps them in a sibling `foo/` directory.
fn rust_module_dir(file_path: &Path) -> Option<PathBuf> {
    let parent = file_path.parent()?;
    match file_path.file_stem().and_then(|stem| stem.to_str()) {
        Some("mod" | "lib" | "main") | None => Some(parent.to_path_buf()),
        Some(stem) => Some(parent.join(stem)),
    }
}

/// Resolve a Rust `use` path to a file index in the file map.
///
/// Handles `crate::`, `self::`, and `super::` prefixes (including chained
/// `super::super::`), with or without a leading visibility modifier such as
/// `pub` or `pub(crate)`. External crate imports are dropped (returns `None`).
///
/// `self::` and `super::` are resolved relative to the module that
/// `file_path` defines: for `src/a/b.rs` the module's children live in
/// `src/a/b/` and its parent's children in `src/a/`; for `src/a/mod.rs` they
/// live in `src/a/` and `src/` respectively. `use` declarations nested inside
/// inline `mod { … }` blocks are resolved as if they were at file level.
///
/// Trailing path segments are dropped progressively, because the last
/// segments may name items (struct, fn) rather than modules. For each prefix
/// both `<path>.rs` and `<path>/mod.rs` are tried.
fn resolve_rust_import(
    raw: &str,
    file_path: &Path,
    root: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    // `use crate::foo::bar;` / `pub use crate::foo::bar::Baz;` → `crate::foo::bar[::Baz]`
    let declaration = raw.trim().trim_end_matches(';').trim_end();
    let use_path = strip_rust_use_prefix(declaration);

    let segments: Vec<&str> = use_path.split("::").map(str::trim).collect();
    let head = *segments.first()?;

    // Determine the base directory and how many leading segments it consumed.
    let (base, skip) = match head {
        "crate" => {
            // Find the nearest Cargo.toml ancestor to determine the crate root.
            // In a workspace, `crate::foo` resolves relative to the crate's src/,
            // not the workspace root.
            let mut dir = file_path.parent();
            let crate_root = loop {
                match dir {
                    Some(d) if d.join("Cargo.toml").exists() => break d.join("src"),
                    Some(d) => dir = d.parent(),
                    None => break root.join("src"), // fallback
                }
            };
            (crate_root, 1)
        }
        "self" => (rust_module_dir(file_path)?, 1),
        "super" => {
            // Each leading `super` climbs one module level.
            let supers = segments.iter().take_while(|seg| **seg == "super").count();
            let mut dir = rust_module_dir(file_path)?;
            for _ in 0..supers {
                dir = dir.parent()?.to_path_buf();
            }
            (dir, supers)
        }
        // External crate — drop
        _ => return None,
    };

    // Build candidate paths from the remaining segments, longest prefix first.
    let path_segments = &segments[skip..];
    for end in (1..=path_segments.len()).rev() {
        let mut candidate = base.clone();
        for seg in &path_segments[..end] {
            // Strip brace groups like `{Foo, Bar}`
            let clean = seg.split('{').next().unwrap_or(seg).trim();
            if !clean.is_empty() {
                candidate.push(clean);
            }
        }

        // Try file.rs
        let as_file = candidate.with_extension("rs");
        if let Some(&idx) = file_index.get(&as_file) {
            return Some(idx);
        }

        // Try dir/mod.rs
        let as_mod = candidate.join("mod.rs");
        if let Some(&idx) = file_index.get(&as_mod) {
            return Some(idx);
        }
    }

    None
}
523
524/// Resolve an import path to a file index based on file extension.
525fn resolve_import(
526    raw: &str,
527    ext: &str,
528    file_path: &Path,
529    root: &Path,
530    file_index: &HashMap<PathBuf, usize>,
531) -> Option<usize> {
532    match ext {
533        "rs" => resolve_rust_import(raw, file_path, root, file_index),
534        "py" | "pyi" => resolve_python_import(raw, root, file_index),
535        "js" | "jsx" | "ts" | "tsx" => resolve_js_import(raw, file_path, file_index),
536        // Go imports use full package paths — skip local resolution
537        _ => None,
538    }
539}
540
/// Resolve a Python import to a file index.
///
/// Handles `import foo.bar` and `from foo.bar import baz` patterns. For a
/// comma-separated statement such as `import a, b`, the first module is the
/// one resolved.
///
/// The dotted module path is mapped onto a path below `root` and looked up
/// as `<path>.py`, `<path>.pyi`, `<path>/__init__.py`, and
/// `<path>/__init__.pyi`, in that order. Relative imports (leading dots) are
/// not resolved against the importing file, since its path is not available
/// here.
///
/// Returns `None` when `raw` does not start with `from ` or `import `, or
/// when no candidate is present in `file_index`.
fn resolve_python_import(
    raw: &str,
    root: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    let rest = raw
        .strip_prefix("from ")
        .or_else(|| raw.strip_prefix("import "))?;

    // First module token; a comma ends it so `import a, b` yields `a`, not `a,`.
    let module_path = rest
        .split(|c: char| c.is_whitespace() || c == ',')
        .find(|token| !token.is_empty())?;

    let module_base = root.join(module_path.split('.').collect::<PathBuf>());

    let as_module_file = ["py", "pyi"]
        .iter()
        .map(|ext| module_base.with_extension(ext));
    let as_package_init = ["__init__.py", "__init__.pyi"]
        .iter()
        .map(|init_name| module_base.join(init_name));

    as_module_file
        .chain(as_package_init)
        .find_map(|candidate| file_index.get(&candidate).copied())
}
574
/// Collapse `.` and `..` components of `path` purely lexically (no
/// filesystem access), so the result can be compared against index keys.
///
/// A `..` removes the preceding normal component; when there is none to
/// remove it is kept as-is.
fn lexically_normalize_js_path(path: &Path) -> PathBuf {
    use std::path::Component;

    let mut out = PathBuf::new();
    for component in path.components() {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                let can_pop = matches!(out.components().next_back(), Some(Component::Normal(_)));
                if can_pop {
                    out.pop();
                } else {
                    out.push("..");
                }
            }
            other => out.push(other.as_os_str()),
        }
    }
    out
}

/// Resolve a JS/TS import to a file index.
///
/// Handles relative paths like `./foo` or `../bar`; anything that does not
/// start with `.` (bare package specifiers) returns `None`.
///
/// The specifier is joined onto the importing file's directory and
/// normalised lexically, because `file_index` keys are compared component by
/// component and an unresolved `..` would never match. Candidates are then
/// tried in this order:
///
/// 1. the path exactly as written (specifier already carries an extension);
/// 2. the path with `.js`, `.jsx`, `.ts`, `.tsx` *appended*, so dotted names
///    such as `./app.component` find `app.component.ts`;
/// 3. if the specifier has an extension, the same path with that extension
///    *replaced* (covers `./foo.js` written for a `foo.ts` source);
/// 4. `<path>/index.{js,jsx,ts,tsx}`.
fn resolve_js_import(
    raw: &str,
    file_path: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    const EXTENSIONS: [&str; 4] = ["js", "jsx", "ts", "tsx"];

    if !raw.starts_with('.') {
        return None;
    }

    let dir = file_path.parent()?;
    let candidate = lexically_normalize_js_path(&dir.join(raw));
    let lookup = |path: &Path| file_index.get(path).copied();

    // 1. Exact path as written.
    if let Some(idx) = lookup(&candidate) {
        return Some(idx);
    }

    // 2. Extension appended (keeps any dotted suffix in the file name).
    for ext in EXTENSIONS {
        let mut with_suffix = candidate.clone().into_os_string();
        with_suffix.push(".");
        with_suffix.push(ext);
        if let Some(idx) = lookup(Path::new(&with_suffix)) {
            return Some(idx);
        }
    }

    // 3. Existing extension swapped for a source extension.
    if candidate.extension().is_some() {
        for ext in EXTENSIONS {
            if let Some(idx) = lookup(&candidate.with_extension(ext)) {
                return Some(idx);
            }
        }
    }

    // 4. Directory import → index file.
    for ext in EXTENSIONS {
        let index_file = candidate.join("index").with_extension(ext);
        if let Some(idx) = lookup(&index_file) {
            return Some(idx);
        }
    }

    None
}
606
607// ── Extraction ───────────────────────────────────────────────────────
608
609/// Extract the name of the first decorator from a `decorated_definition` node.
610///
611/// Mirrors the private `languages::first_decorator_ident` but inlined here
612/// because that function is private to the `languages` module.
613///
614/// For simple `@name` decorators (e.g., `@property`, `@classmethod`), returns
615/// `Some("property")` or `Some("classmethod")`.
616///
617/// For attribute-access decorators (e.g., `@functools.lru_cache`), returns
618/// `Some("functools.lru_cache")` — the full dotted-name text.
619///
620/// For call-expression decorators (e.g., `@app.route("/")`), returns `None`.
621fn extract_first_decorator_name(node: &tree_sitter::Node<'_>, source: &[u8]) -> Option<String> {
622    let mut cursor = node.walk();
623    for child in node.children(&mut cursor) {
624        if child.kind() == "decorator" {
625            let mut inner = child.walk();
626            for inner_child in child.children(&mut inner) {
627                match inner_child.kind() {
628                    // Simple name (@property) or attribute access (@functools.lru_cache) —
629                    // return the full text so callers can store it for display.
630                    "identifier" | "attribute" => {
631                        return std::str::from_utf8(
632                            &source[inner_child.start_byte()..inner_child.end_byte()],
633                        )
634                        .ok()
635                        .map(str::to_owned);
636                    }
637                    // Call expression — ambiguous; treat as None.
638                    "call" => return None,
639                    _ => {}
640                }
641            }
642            return None;
643        }
644    }
645    None
646}
647
648/// Determine whether a Python `(assignment)` or JS/TS `(variable_declarator)`
649/// def captured by the def-query is spuriously nested inside a function body.
650///
651/// Cycle 10 W1 Front A — the Python def-query at `languages.rs:637` and the
652/// JS/TS variants at `:647`, `:657`, `:669` capture every `x = foo()` or
653/// `const x = foo()` site, even those inside function bodies, as defs. The
654/// `extract_calls` smallest-enclosing rule (`repo_map.rs:827-832`) then steals
655/// every call out of the surrounding function into the nested assignment def.
656/// Result: BFS terminates early because function defs have no outgoing edges.
657///
658/// Fix: walk up the AST from the def node. If we hit a function-body context
659/// (Python `function_definition`, JS/TS `function_declaration`,
660/// `function_expression`, `arrow_function`, `method_definition`,
661/// `generator_function_declaration`, `generator_function`) before reaching the
662/// module root, the def is spurious and must be dropped.
663///
664/// Module-level constants (`MAX = 4096`) and class-attribute assignments
665/// (`class Foo: bar = make_bar()`) — both of which are legitimate defs — are
666/// preserved because their ancestor chain contains only `module` /
667/// `class_definition` / `class_body` / `block` / `program` nodes, never a
668/// function-body container.
669fn is_spurious_nested_binding_def(kind: &str, node: tree_sitter::Node<'_>) -> bool {
670    // Only the assignment / variable_declarator captures are at risk.
671    // Other def kinds (function_definition, class_definition, method_definition,
672    // function_declaration, class_declaration, type_alias_declaration, etc.)
673    // are always legitimate at any nesting depth (e.g. nested functions,
674    // methods inside classes).
675    if !matches!(kind, "assignment" | "variable_declarator") {
676        return false;
677    }
678    let mut cur = node.parent();
679    while let Some(parent) = cur {
680        match parent.kind() {
681            // Function-body containers: any def captured beneath one of these
682            // is a local-variable binding, not a module/class-level def.
683            "function_definition"           // Python
684            | "function_declaration"        // JS / TS
685            | "function_expression"         // JS / TS
686            | "arrow_function"              // JS / TS
687            | "method_definition"           // JS / TS
688            | "generator_function_declaration"
689            | "generator_function" => return true,
690            // Top-level containers — we reached the module without finding a
691            // function ancestor. The def is legitimate.
692            "module" | "program" => return false,
693            _ => {}
694        }
695        cur = parent.parent();
696    }
697    false
698}
699
700/// Extract definitions from a source file using tree-sitter.
701fn extract_definitions(source: &str, config: &languages::LangConfig) -> Vec<Definition> {
702    let mut parser = Parser::new();
703    if parser.set_language(&config.language).is_err() {
704        return vec![];
705    }
706    let Some(tree) = parser.parse(source, None) else {
707        return vec![];
708    };
709
710    let mut cursor = QueryCursor::new();
711    let mut defs = Vec::new();
712    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
713
714    while let Some(m) = matches.next() {
715        let mut name = String::new();
716        let mut def_node = None;
717
718        for cap in m.captures {
719            let cap_name = &config.query.capture_names()[cap.index as usize];
720            if *cap_name == "name" {
721                name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
722            } else if *cap_name == "def" {
723                def_node = Some(cap.node);
724            }
725        }
726
727        if let Some(node) = def_node {
728            // Cycle 10 W1 Front A: drop spurious nested assignment / variable_declarator
729            // defs that would steal calls from the enclosing function.
730            if is_spurious_nested_binding_def(node.kind(), node) {
731                continue;
732            }
733            let scope = crate::chunk::build_scope_chain(node, source);
734            let signature = crate::chunk::extract_signature(node, source);
735            #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
736            let start_line = node.start_position().row as u32 + 1;
737            #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
738            let end_line = node.end_position().row as u32 + 1;
739            #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
740            let start_byte = node.start_byte() as u32;
741            #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
742            let end_byte = node.end_byte() as u32;
743            // C1 (4.1.1): For Python `decorated_definition` nodes, extract the
744            // decorator name and compute the LSP kind at AST-parse time so the
745            // projection site in `render_json_budgeted` does not need to re-parse.
746            let (decorator, lsp_kind_hint) = if node.kind() == "decorated_definition" {
747                let dec = extract_first_decorator_name(&node, source.as_bytes());
748                let kind_hint = languages::lsp_symbol_kind_for_decorated_definition(
749                    dec.as_deref().unwrap_or(""),
750                );
751                (dec, Some(kind_hint))
752            } else {
753                (None, None)
754            };
755            defs.push(Definition {
756                name,
757                kind: node.kind().to_string(),
758                start_line,
759                end_line,
760                scope,
761                signature,
762                start_byte,
763                end_byte,
764                calls: vec![],
765                decorator,
766                lsp_kind_hint,
767            });
768        }
769    }
770
771    defs
772}
773
774// ── Call Extraction & Resolution ────────────────────────────────────
775
/// Rank a definition kind for tie-breaking when two defs cover the same
/// byte span.
///
/// Function-like kinds rank `0`; every other kind (class bodies, impl blocks
/// and other structural containers) ranks `1`. The lower rank wins when the
/// value is used as part of a `min_by_key` key, so a call site is attributed
/// to the function rather than to a container of identical extent.
fn is_callable_def_priority(kind: &str) -> u8 {
    /// Node kinds that denote a function or method definition.
    const CALLABLE_KINDS: [&str; 7] = [
        "function_item",
        "function_definition",
        "function_declaration",
        "function_signature_item",
        "method_definition",
        "method_declaration",
        "method",
    ];

    if CALLABLE_KINDS.contains(&kind) { 0 } else { 1 }
}
799
800// ── I#77: Python import aliases ─────────────────────────────────────────
801//
802// `from X import Y as Z` and `import X as Y` rebind the canonical name to a
803// local identifier. Without alias-rewriting, the call-edge extractor records
804// `Z()` and `Y.fn()` under the alias — names that have no entry in the
805// global `def_index` — and the canonical target's `def_callers` stays empty.
806// Same NC11 closure-attribution failure class as I#57 (Rust closures) and
807// I#71 (JS closures).
808//
809// Aliases are scope-limited: a `from X import Y as Z` inside `def use()`
810// must not bind `Z` for sibling functions. The extractor records each alias
811// with the byte range of its enclosing scope (module = whole file, or the
812// `function_definition` body). The smallest enclosing alias wins at the
813// call site (same shadowing rule as Python's runtime).
814
/// One Python import alias (`from X import Y as Z` or `import X as Y`).
///
/// The alias name `local` is what appears at the call site. Resolution
/// rewrites the call to the canonical form recorded in `canonical_module`
/// and (for `from`-imports) `canonical_name`.
///
/// `scope_start..scope_end` is a half-open byte range: an alias applies to a
/// call at byte `b` when `scope_start <= b < scope_end`.
#[derive(Debug, Clone)]
struct PythonAlias {
    /// Local identifier at the call site (the right-hand side of `as`).
    local: String,
    /// Canonical module path, copied verbatim from the import statement
    /// (e.g. the `X` of `import X as Y` or of `from X import Y as Z`).
    canonical_module: String,
    /// Canonical attribute name within the module (`Y` from `from X import Y as Z`).
    ///
    /// `None` for `import X as Y` — the alias rebinds the module itself,
    /// so `Y.fn()` resolves through `canonical_module::fn` at the receiver,
    /// not by stripping the attribute.
    canonical_name: Option<String>,
    /// Scope start byte (inclusive) — first byte of the enclosing scope
    /// (the enclosing `function_definition` node, or the root node for
    /// module-level imports).
    scope_start: u32,
    /// Scope end byte (exclusive) — one past the last byte of the enclosing
    /// scope.
    scope_end: u32,
}
837
838/// Walk a Python AST and collect every `import X as Y` and
839/// `from X import Y as Z` alias, recording each with the byte range of its
840/// enclosing scope (whole file at module level, `function_definition` body
841/// for function-local imports).
842///
843/// The returned list is sorted by `(scope_end - scope_start)` ascending —
844/// the smallest enclosing scope at any byte offset is the first match for a
845/// given `local` name. (Python's import shadowing rule: an inner-scope
846/// `import X as Y` shadows any outer binding of `Y`.)
847fn extract_python_aliases(source: &str, root: tree_sitter::Node<'_>) -> Vec<PythonAlias> {
848    let mut out: Vec<PythonAlias> = Vec::new();
849    collect_python_aliases_rec(source, root, &mut out);
850    // Smallest scope first so the per-call linear search finds the
851    // narrowest binding before a wider one with the same `local` name.
852    out.sort_by_key(|a| a.scope_end.saturating_sub(a.scope_start));
853    out
854}
855
856/// Recursive helper for [`extract_python_aliases`]. Visits every node and
857/// records aliases on `import_statement` / `import_from_statement`.
858fn collect_python_aliases_rec(
859    source: &str,
860    node: tree_sitter::Node<'_>,
861    out: &mut Vec<PythonAlias>,
862) {
863    match node.kind() {
864        "import_statement" => collect_aliases_import_stmt(source, node, out),
865        "import_from_statement" => collect_aliases_import_from_stmt(source, node, out),
866        _ => {}
867    }
868    let mut cursor = node.walk();
869    for child in node.children(&mut cursor) {
870        collect_python_aliases_rec(source, child, out);
871    }
872}
873
874/// Extract aliases from an `import_statement` node — patterns like
875/// `import X`, `import X.Y`, `import X as Y`, `import X.Y as Z`.
876///
877/// Only `aliased_import` children produce an alias; bare imports are ignored
878/// because the call site already uses the canonical name.
879fn collect_aliases_import_stmt(
880    source: &str,
881    node: tree_sitter::Node<'_>,
882    out: &mut Vec<PythonAlias>,
883) {
884    let scope = enclosing_python_scope(node);
885    let mut cursor = node.walk();
886    for child in node.children(&mut cursor) {
887        if child.kind() != "aliased_import" {
888            continue;
889        }
890        let (Some(name_node), Some(alias_node)) = (
891            child.child_by_field_name("name"),
892            child.child_by_field_name("alias"),
893        ) else {
894            continue;
895        };
896        let canonical_module = source[name_node.start_byte()..name_node.end_byte()].to_string();
897        let local = source[alias_node.start_byte()..alias_node.end_byte()].to_string();
898        out.push(PythonAlias {
899            local,
900            canonical_module,
901            canonical_name: None,
902            scope_start: scope.0,
903            scope_end: scope.1,
904        });
905    }
906}
907
908/// Extract aliases from an `import_from_statement` node — patterns like
909/// `from X import Y`, `from X import Y as Z`, `from X import (Y as Z, W)`.
910///
911/// Only `aliased_import` children produce an alias; bare `from X import Y`
912/// is handled by the existing imported-file resolver (Priority 4) since `Y`
913/// already matches the canonical def name in `X`.
914fn collect_aliases_import_from_stmt(
915    source: &str,
916    node: tree_sitter::Node<'_>,
917    out: &mut Vec<PythonAlias>,
918) {
919    let Some(module_node) = node.child_by_field_name("module_name") else {
920        return;
921    };
922    let canonical_module = source[module_node.start_byte()..module_node.end_byte()].to_string();
923    let scope = enclosing_python_scope(node);
924    let mut cursor = node.walk();
925    for child in node.children(&mut cursor) {
926        if child.kind() != "aliased_import" {
927            continue;
928        }
929        let (Some(name_node), Some(alias_node)) = (
930            child.child_by_field_name("name"),
931            child.child_by_field_name("alias"),
932        ) else {
933            continue;
934        };
935        let canonical_name = source[name_node.start_byte()..name_node.end_byte()].to_string();
936        let local = source[alias_node.start_byte()..alias_node.end_byte()].to_string();
937        out.push(PythonAlias {
938            local,
939            canonical_module: canonical_module.clone(),
940            canonical_name: Some(canonical_name),
941            scope_start: scope.0,
942            scope_end: scope.1,
943        });
944    }
945}
946
947/// Compute the byte range of the smallest enclosing Python scope for an
948/// import statement.
949///
950/// Walks up from `node` until we find a `function_definition` (function-local
951/// import) or run out of ancestors (module-level import, scope = whole file).
952/// Class-level imports are treated as module-level for this purpose: Python
953/// class bodies do not introduce a true lexical scope for nested function
954/// lookups, so the simplest correct rule is "function-local or wider".
955fn enclosing_python_scope(node: tree_sitter::Node<'_>) -> (u32, u32) {
956    let mut cur = node.parent();
957    while let Some(parent) = cur {
958        if parent.kind() == "function_definition" {
959            #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
960            return (parent.start_byte() as u32, parent.end_byte() as u32);
961        }
962        cur = parent.parent();
963    }
964    // Reached the module root — alias is module-level. Use the root node's
965    // range (effectively the whole file).
966    let mut root = node;
967    while let Some(parent) = root.parent() {
968        root = parent;
969    }
970    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
971    {
972        (root.start_byte() as u32, root.end_byte() as u32)
973    }
974}
975
976/// Find the smallest enclosing alias for a given byte offset and local name.
977///
978/// Returns the first alias (in sorted-by-scope-size order) whose `local`
979/// equals `local_name` and whose scope contains `byte_offset`. Smallest-
980/// scope-first ordering implements Python's shadowing rule: a function-local
981/// `import X as Y` shadows a module-level `import Z as Y`.
982fn lookup_python_alias<'a>(
983    aliases: &'a [PythonAlias],
984    local_name: &str,
985    byte_offset: u32,
986) -> Option<&'a PythonAlias> {
987    aliases.iter().find(|a| {
988        a.local == local_name && a.scope_start <= byte_offset && byte_offset < a.scope_end
989    })
990}
991
992/// Rewrite a Python call's `name` and `qualified_path` through the alias map
993/// in place. Used inside [`extract_calls`] so the resolver sees the canonical
994/// target identifier and qualifier prefix instead of the local alias.
995///
996/// Two patterns are recognised, mirroring the two `aliased_import` shapes:
997///
998/// 1. **`from X import Y as Z; Z(...)`** — `name` is the bare identifier
999///    matching alias `local = Z`. Rewrite `name = Y`, set
1000///    `qualified_path = Some("X::Y")` so Priority 1 (qualified-path)
1001///    fires on the resolver.
1002/// 2. **`import X as Y; Y.fn(...)`** — callee node sits inside an
1003///    `(attribute attribute: (identifier) @callee)` shape. The
1004///    sibling `object:` child is the receiver. If the receiver is a bare
1005///    identifier matching alias `local = Y` (with `canonical_name = None`,
1006///    i.e. an `import`-style rather than `from`-style alias), keep `name`
1007///    as the attribute and set `qualified_path = Some("X::fn")`.
1008///
1009/// A function-local alias shadows a module-level alias of the same name
1010/// because [`extract_python_aliases`] sorts smallest-scope-first.
1011fn rewrite_python_call_via_alias(
1012    source: &str,
1013    aliases: &[PythonAlias],
1014    callee_node: tree_sitter::Node<'_>,
1015    call_byte: u32,
1016    name: &mut String,
1017    qualified_path: &mut Option<String>,
1018) {
1019    // Pattern 1: bare call. The callee node's parent is the `call`
1020    // itself (`(call function: (identifier) @callee)`). If the parent's
1021    // `function:` field is the identifier (not an `attribute`), this is
1022    // a bare call.
1023    let is_bare_call = callee_node
1024        .parent()
1025        .filter(|p| p.kind() == "call")
1026        .and_then(|p| p.child_by_field_name("function"))
1027        .is_some_and(|fn_node| fn_node.id() == callee_node.id());
1028
1029    if is_bare_call && let Some(alias) = lookup_python_alias(aliases, name.as_str(), call_byte) {
1030        // `from X import Y as Z`: rewrite to canonical `Y` with qualifier `X::Y`.
1031        // `import X as Z` called as `Z(...)`: less common but legal; the
1032        // canonical call is `X(...)` and the only sensible qualifier is `X`.
1033        let canonical = alias
1034            .canonical_name
1035            .clone()
1036            .unwrap_or_else(|| alias.canonical_module.clone());
1037        let qpath = match &alias.canonical_name {
1038            Some(cn) => format!("{}::{cn}", alias.canonical_module),
1039            None => alias.canonical_module.clone(),
1040        };
1041        *name = canonical;
1042        *qualified_path = Some(qpath);
1043        return;
1044    }
1045
1046    // Pattern 2: attribute call `receiver.fn()`. The callee node's
1047    // grandparent (`(call function: (attribute attribute: (identifier) @callee)) @call`)
1048    // gives us the attribute node, whose `object` field is the receiver.
1049    //
1050    // We only rewrite when the receiver is a bare identifier matching an
1051    // `import X as Y`-style alias (canonical_name = None). For a
1052    // `from X import Mod as M`-style alias on a receiver, the canonical
1053    // call would be `Mod.fn()` — but `Mod` is an attribute of `X`, not a
1054    // module path we can resolve, so we leave it alone.
1055    let Some(attr_node) = callee_node.parent().filter(|p| p.kind() == "attribute") else {
1056        return;
1057    };
1058    let Some(object_node) = attr_node.child_by_field_name("object") else {
1059        return;
1060    };
1061    if object_node.kind() != "identifier" {
1062        // `a.b.c()` — receiver is itself an attribute. Not a single-alias
1063        // rewrite; leave as-is for the resolver to handle.
1064        return;
1065    }
1066    let receiver = &source[object_node.start_byte()..object_node.end_byte()];
1067    let Some(alias) = lookup_python_alias(aliases, receiver, call_byte) else {
1068        return;
1069    };
1070    if alias.canonical_name.is_some() {
1071        // The alias rebinds a name, not a module — receiver is `M` from
1072        // `from X import Mod as M`. Canonical `Mod.fn()` isn't a path we
1073        // can construct without knowing whether `Mod` is a class or
1074        // submodule. Leave unresolved.
1075        return;
1076    }
1077    // `import X as receiver`: rewrite to qualifier `X::name`. The bare
1078    // `name` (the attribute identifier) is preserved — it is already the
1079    // function name in module `X`.
1080    *qualified_path = Some(format!("{}::{name}", alias.canonical_module));
1081}
1082
1083/// Extract call sites from a source file and assign them to definitions.
1084///
1085/// Uses the language's call query to find all call expressions, then
1086/// assigns each call to the definition whose byte range contains it.
1087/// Calls outside any definition body (module-level) are ignored.
1088///
1089/// For Rust scoped calls (`a::b::foo()`), the `@callee` capture returns the
1090/// full `scoped_identifier` node. This function splits it into:
1091/// - `name` = bare trailing identifier (`"foo"`)
1092/// - `qualified_path` = `Some("a::b::foo")` for disambiguation in `resolve_calls`.
1093///
1094/// For method calls (`x.method()`), `receiver_type` is inferred from local
1095/// context (parameter annotations, let-bindings, impl blocks). See
1096/// [`infer_receiver_types`] for the heuristic.
1097fn extract_calls(source: &str, call_config: &languages::CallConfig, defs: &mut [Definition]) {
1098    let mut parser = Parser::new();
1099    if parser.set_language(&call_config.language).is_err() {
1100        return;
1101    }
1102    let Some(tree) = parser.parse(source, None) else {
1103        return;
1104    };
1105
1106    // Build receiver-type map: byte_offset_of_call → receiver_type_string.
1107    // Done once per file to amortise the tree walk cost.
1108    let receiver_map = infer_receiver_types(source, &tree, &call_config.language);
1109
1110    // HCL: run the HCL-specific call-edge extractor as a post-pass so the
1111    // terraform_remote_state references and module blocks contribute edges
1112    // that the generic function_call query cannot capture (R2 + R3, Wave 3).
1113    if languages::is_hcl_language(&call_config.language) {
1114        extract_hcl_call_edges(source, tree.root_node(), defs);
1115    }
1116
1117    // C / C++ (I#55, 4.1.5): emit synthetic call-graph edges from
1118    // struct-literal initializer fnptrs. C codebases dispatch via tables of
1119    // function pointers (Linux `file_operations`, redis `redisCommandTable`,
1120    // libuv handle vtables) — the generic call-expression query cannot
1121    // capture these because there is no syntactic `f()` call site; the
1122    // function is referenced by bare identifier inside `{ ... }`. Without
1123    // this post-pass, every implementation referenced exclusively via such
1124    // a table appears dead (Part XI §XI.4: kernel mega-cluster collapse on
1125    // Linux, command-implementation collapse on redis).
1126    if languages::is_c_language(&call_config.language)
1127        || languages::is_cpp_language(&call_config.language)
1128    {
1129        extract_c_struct_init_edges(source, tree.root_node(), defs);
1130    }
1131
1132    // I#77: Python import-alias map. For Python source, walk the AST once
1133    // and collect every `import X as Y` / `from X import Y as Z` binding,
1134    // tagged with its enclosing scope's byte range. The per-call lookup
1135    // below rewrites `Y(...)` and `Y.fn(...)` to their canonical targets
1136    // before the resolver runs.
1137    let python_aliases: Vec<PythonAlias> = if languages::is_python_language(&call_config.language) {
1138        extract_python_aliases(source, tree.root_node())
1139    } else {
1140        Vec::new()
1141    };
1142
1143    let mut cursor = QueryCursor::new();
1144    let mut matches = cursor.matches(&call_config.query, tree.root_node(), source.as_bytes());
1145
1146    while let Some(m) = matches.next() {
1147        let mut full_callee_text = None;
1148        let mut call_byte = 0u32;
1149        let mut callee_node: Option<tree_sitter::Node<'_>> = None;
1150
1151        for cap in m.captures {
1152            let cap_name = &call_config.query.capture_names()[cap.index as usize];
1153            if *cap_name == "callee" {
1154                full_callee_text =
1155                    Some(source[cap.node.start_byte()..cap.node.end_byte()].to_string());
1156                #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1157                {
1158                    call_byte = cap.node.start_byte() as u32;
1159                }
1160                callee_node = Some(cap.node);
1161            }
1162        }
1163
1164        if let Some(full_text) = full_callee_text {
1165            // Split qualified path into bare name + optional qualifier.
1166            let (mut name, mut qualified_path) = if full_text.contains("::") {
1167                let bare = full_text
1168                    .rsplit("::")
1169                    .next()
1170                    .unwrap_or(&full_text)
1171                    .to_string();
1172                (bare, Some(full_text))
1173            } else {
1174                (full_text, None)
1175            };
1176
1177            // I#77: Python alias rewriting. Two cases:
1178            //
1179            // 1. Bare call `baz()` where `from X import Y as baz`:
1180            //    rewrite `name` to canonical `Y` and set
1181            //    `qualified_path = Some("X::Y")` so Priority 1 fires.
1182            //
1183            // 2. Attribute call `m2.fn()` where `import X as m2`:
1184            //    the callee text is `fn`, but the receiver `m2` is an
1185            //    alias for module `X`. Walk to the `attribute object`
1186            //    node to extract the receiver identifier, look it up,
1187            //    and set `qualified_path = Some("X::fn")` so Priority 1
1188            //    fires.
1189            if !python_aliases.is_empty()
1190                && let Some(callee) = callee_node
1191            {
1192                rewrite_python_call_via_alias(
1193                    source,
1194                    &python_aliases,
1195                    callee,
1196                    call_byte,
1197                    &mut name,
1198                    &mut qualified_path,
1199                );
1200            }
1201
1202            // Look up receiver type from the pre-built map.
1203            let receiver_type = receiver_map.get(&call_byte).cloned();
1204
1205            // Assign to the most-specific (smallest byte range) enclosing definition.
1206            // Using `find` (first match) was incorrect for nested defs: an `impl_item`
1207            // wrapping a `function_item` both contain the call site, but the
1208            // `function_item` is the correct granularity for method attribution.
1209            //
1210            // Tiebreak: when two defs have equal byte spans (as happens in Python where
1211            // the class body `block` and its first `function_definition` share the same
1212            // start/end bytes), prefer function-like defs over structural container defs.
1213            // `is_callable_def` returns 0 for function-like kinds (sorts first in min_by_key).
1214            let enclosing_idx = defs
1215                .iter()
1216                .enumerate()
1217                .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
1218                .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
1219                .map(|(i, _)| i);
1220
1221            if let Some(idx) = enclosing_idx {
1222                // Skip self-recursive calls (compare bare name to def name).
1223                if defs[idx].name != name {
1224                    defs[idx].calls.push(CallRef {
1225                        name,
1226                        qualified_path,
1227                        receiver_type,
1228                        byte_offset: call_byte,
1229                        resolved: None,
1230                    });
1231                }
1232            }
1233            // Calls outside any definition are ignored (module-level init).
1234        }
1235    }
1236
1237    // JS / TS / TSX (B-0005): attribute closure-argument call edges to the
1238    // nearest named enclosing function. Arrow functions and function
1239    // expressions passed as arguments (e.g. `useCallback(() => fn(), [deps])`
1240    // or `setTimeout(() => fn(), 0)`) are "passthrough" scopes — their inner
1241    // calls bubble up to the nearest `function_declaration` or
1242    // `method_definition` ancestor. Named const-assigned arrows (`const f =
1243    // () => ...`) are NOT passthrough; they are their own def targets.
1244    //
1245    // This runs as a post-pass so that the main-loop dedup check for existing
1246    // (byte_offset, name) pairs can prevent duplicates from being emitted when
1247    // a call is already attributed to the correct named function.
1248    if is_js_or_ts_language(&call_config.language) {
1249        extract_js_closure_call_edges(source, tree.root_node(), defs);
1250    }
1251}
1252
1253// ── JS / TS closure call-edge attribution ────────────────────────────────────
1254
1255/// Returns `true` if `lang` is one of the JavaScript / TypeScript grammars.
1256///
1257/// Uses the same node-kind-count proxy as [`languages::is_rust_language`].
1258/// JS, TS, and TSX all need closure attribution (B-0005).
1259fn is_js_or_ts_language(lang: &tree_sitter::Language) -> bool {
1260    let js_lang: tree_sitter::Language = tree_sitter_javascript::LANGUAGE.into();
1261    let ts_lang: tree_sitter::Language = tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into();
1262    let tsx_lang: tree_sitter::Language = tree_sitter_typescript::LANGUAGE_TSX.into();
1263    let matches = |reference: tree_sitter::Language| {
1264        lang.abi_version() == reference.abi_version()
1265            && lang.node_kind_count() == reference.node_kind_count()
1266    };
1267    matches(js_lang) || matches(ts_lang) || matches(tsx_lang)
1268}
1269
1270/// Returns `true` if `node` is an `arrow_function` or `function_expression`
1271/// that is passed directly as an argument to another call expression.
1272///
1273/// The check is: the node's immediate parent is an `arguments` node.
1274/// This distinguishes "argument closures" (passthrough for call attribution)
1275/// from "named const arrows" (`const f = () => ...`) whose parent is a
1276/// `variable_declarator`.
1277fn is_argument_closure(node: tree_sitter::Node<'_>) -> bool {
1278    matches!(node.kind(), "arrow_function" | "function_expression")
1279        && node.parent().is_some_and(|p| p.kind() == "arguments")
1280}
1281
1282/// Walk up the tree from `node`, passing through all transparent nodes
1283/// (`arrow_function`, `function_expression`, `statement_block`, `arguments`,
1284/// `call_expression`, `variable_declarator`, `lexical_declaration`,
1285/// `expression_statement`, etc.) until we reach a `function_declaration` or
1286/// `method_definition`, which is the nearest named enclosing function.
1287///
1288/// Returns `None` if no named function is found above `node` (e.g. the
1289/// closure lives at module/file scope).
1290fn nearest_named_fn_ancestor(node: tree_sitter::Node<'_>) -> Option<tree_sitter::Node<'_>> {
1291    let mut current = node.parent()?;
1292    loop {
1293        match current.kind() {
1294            // Named function boundaries: stop here.
1295            "function_declaration" | "method_definition" => return Some(current),
1296            // All other node kinds are transparent: keep climbing.
1297            //
1298            // The explicit list of transparent kinds (closures, declarations,
1299            // control-flow, etc.) is documented here for readability, but the
1300            // wildcard covers any new node kinds added in future grammar
1301            // versions — the walk is intentionally permissive so that
1302            // attribution never silently breaks on grammar updates.
1303            //
1304            // Transparent: "arrow_function", "function_expression",
1305            // "statement_block", "arguments", "call_expression",
1306            // "member_expression", "variable_declarator", "lexical_declaration",
1307            // "expression_statement", "return_statement", "await_expression",
1308            // "class_body", "class_declaration", "export_statement",
1309            // "if_statement", "while_statement", "for_statement", etc.
1310            _ => {}
1311        }
1312        match current.parent() {
1313            Some(p) => current = p,
1314            None => return None,
1315        }
1316    }
1317}
1318
1319/// Collect all `call_expression` callee names that appear directly inside
1320/// `closure` (an `arrow_function` or `function_expression`), visiting only
1321/// one level of closure depth (recursive closures are handled by the outer
1322/// DFS in [`extract_js_closure_call_edges`]).
1323///
1324/// Returns a `Vec` of `(callee_name, byte_offset)` pairs matching the callee
1325/// capture of the JS call query: either a bare `identifier` or the
1326/// `property_identifier` from a `member_expression`.
1327fn collect_calls_in_closure<'a>(
1328    source: &'a str,
1329    closure: tree_sitter::Node<'a>,
1330) -> Vec<(String, u32)> {
1331    let mut results = Vec::new();
1332    let mut stack: Vec<tree_sitter::Node<'_>> = Vec::new();
1333    // Start from the closure body, not the closure node itself.
1334    let body = closure.child_by_field_name("body").unwrap_or(closure);
1335    let mut cursor = body.walk();
1336    for child in body.children(&mut cursor) {
1337        stack.push(child);
1338    }
1339
1340    while let Some(node) = stack.pop() {
1341        if node.kind() == "call_expression" {
1342            // Extract callee: either `function: identifier` or
1343            // `function: member_expression property: property_identifier`.
1344            if let Some(fn_node) = node.child_by_field_name("function") {
1345                let callee_opt: Option<(String, u32)> = match fn_node.kind() {
1346                    "identifier" => {
1347                        let name = source[fn_node.start_byte()..fn_node.end_byte()].to_string();
1348                        #[expect(
1349                            clippy::cast_possible_truncation,
1350                            reason = "byte offsets fit in u32"
1351                        )]
1352                        let byte = fn_node.start_byte() as u32;
1353                        Some((name, byte))
1354                    }
1355                    "member_expression" => fn_node.child_by_field_name("property").map(|prop| {
1356                        let name = source[prop.start_byte()..prop.end_byte()].to_string();
1357                        #[expect(
1358                            clippy::cast_possible_truncation,
1359                            reason = "byte offsets fit in u32"
1360                        )]
1361                        let byte = prop.start_byte() as u32;
1362                        (name, byte)
1363                    }),
1364                    _ => None,
1365                };
1366                if let Some(pair) = callee_opt {
1367                    results.push(pair);
1368                }
1369            }
1370        }
1371        // Recurse into children, but do NOT descend into nested
1372        // arrow_function / function_expression nodes — those are either
1373        // handled by the outer DFS (if they are argument closures) or are
1374        // named const-arrows (which are their own defs and handled by the
1375        // main loop).
1376        if !matches!(node.kind(), "arrow_function" | "function_expression") {
1377            let mut c = node.walk();
1378            for child in node.children(&mut c) {
1379                stack.push(child);
1380            }
1381        }
1382    }
1383    results
1384}
1385
1386/// Post-pass for JS / TS / TSX files.
1387///
1388/// Two sub-passes:
1389///
1390/// **Pass 1 — closure-argument attribution**: For every `arrow_function` or
1391/// `function_expression` that is passed as an argument to a call (i.e., its
1392/// parent is an `arguments` node), attributes the inner calls to the nearest
1393/// named enclosing `function_declaration` or `method_definition`.
1394///
1395/// **Pass 2 — variable-declarator propagation**: For every `variable_declarator`
1396/// def that sits inside a named function, propagates all its calls to the
1397/// enclosing named function. This captures the outer call itself (e.g.
1398/// `useCallback(...)` in `const handler = useCallback(...)`) which the main
1399/// loop attributed to `handler` (the smallest enclosing def) but which also
1400/// belongs to the enclosing React component.
1401///
1402/// Both passes use `(byte_offset, name)` deduplication so calls are never
1403/// duplicated on a single def.
1404///
1405/// This fixes B-0005: React hooks (`useCallback`, `useMemo`, `useEffect`),
1406/// timer APIs (`setTimeout`, `setInterval`), array methods
1407/// (`.map`, `.filter`, `.forEach`, `.reduce`), and Express middleware
1408/// (`app.use`, `app.get`) all pass closures as arguments. Without this
1409/// post-pass, every function called INSIDE such a closure appears dead
1410/// because the call edge is attributed to the anonymous closure (which has
1411/// no def of its own) rather than to the enclosing named function.
1412///
1413/// **Rule** (from DESIGN §B-0005): anonymous closures and arrows are
1414/// "passthrough". Walk up to the nearest `function_declaration` /
1415/// `method_definition`. If none is found (closure at module scope), no edge
1416/// is emitted — matching the existing behaviour for top-level JS calls.
1417fn extract_js_closure_call_edges(
1418    source: &str,
1419    root: tree_sitter::Node<'_>,
1420    defs: &mut [Definition],
1421) {
1422    // Pass 1: walk tree — attribute calls inside argument-closure bodies to
1423    // the nearest named enclosing function.
1424    {
1425        let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1426        while let Some(node) = stack.pop() {
1427            if is_argument_closure(node) {
1428                // Find the nearest named function ancestor.
1429                if let Some(named_fn) = nearest_named_fn_ancestor(node) {
1430                    // Identify which def this named function corresponds to by
1431                    // matching byte range and kind.
1432                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1433                    let fn_start = named_fn.start_byte() as u32;
1434                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1435                    let fn_end = named_fn.end_byte() as u32;
1436
1437                    let def_idx = defs.iter().position(|d| {
1438                        d.start_byte == fn_start
1439                            && d.end_byte == fn_end
1440                            && matches!(
1441                                d.kind.as_str(),
1442                                "function_declaration" | "method_definition"
1443                            )
1444                    });
1445
1446                    if let Some(idx) = def_idx {
1447                        // Collect calls inside this closure (one level deep;
1448                        // nested closures are handled by their own DFS iteration).
1449                        let inner_calls = collect_calls_in_closure(source, node);
1450                        for (name, byte_offset) in inner_calls {
1451                            // Skip self-recursive calls.
1452                            if defs[idx].name == name {
1453                                continue;
1454                            }
1455                            // Skip duplicate edges already present in the def.
1456                            if defs[idx]
1457                                .calls
1458                                .iter()
1459                                .any(|c| c.byte_offset == byte_offset && c.name == name)
1460                            {
1461                                continue;
1462                            }
1463                            defs[idx].calls.push(CallRef {
1464                                name,
1465                                qualified_path: None,
1466                                receiver_type: None,
1467                                byte_offset,
1468                                resolved: None,
1469                            });
1470                        }
1471                    }
1472                }
1473            }
1474            // Recurse into all children.
1475            let mut cursor = node.walk();
1476            for child in node.children(&mut cursor) {
1477                stack.push(child);
1478            }
1479        }
1480    }
1481
1482    // Pass 2: propagate calls from variable_declarator defs to their enclosing
1483    // named function defs.
1484    //
1485    // When a JS pattern like `const handler = useCallback(...)` creates a
1486    // `variable_declarator` def named `handler`, the main loop attributes
1487    // calls like `useCallback(...)` to `handler` (the smallest enclosing def).
1488    // Those calls also semantically belong to the enclosing named function
1489    // (e.g. `Component`) because they execute in its runtime context.
1490    //
1491    // We collect (variable_declarator_idx, enclosing_named_fn_idx) pairs first
1492    // to avoid mutating `defs` while iterating over it.
1493    let propagation_pairs: Vec<(usize, usize)> = defs
1494        .iter()
1495        .enumerate()
1496        .filter(|(_, d)| d.kind == "variable_declarator")
1497        .filter_map(|(var_idx, var_def)| {
1498            // Find the smallest named function def that strictly contains this
1499            // variable_declarator (not equal byte range — strict containment).
1500            let enclosing = defs
1501                .iter()
1502                .enumerate()
1503                .filter(|(enc_idx, enc)| {
1504                    *enc_idx != var_idx
1505                        && matches!(
1506                            enc.kind.as_str(),
1507                            "function_declaration" | "method_definition"
1508                        )
1509                        && enc.start_byte <= var_def.start_byte
1510                        && var_def.end_byte <= enc.end_byte
1511                })
1512                .min_by_key(|(_, enc)| enc.end_byte - enc.start_byte)
1513                .map(|(i, _)| i);
1514            enclosing.map(|enc_idx| (var_idx, enc_idx))
1515        })
1516        .collect();
1517
1518    for (var_idx, enc_idx) in propagation_pairs {
1519        // Clone the calls to propagate (avoiding borrow-checker conflict).
1520        let calls_to_propagate: Vec<(String, u32)> = defs[var_idx]
1521            .calls
1522            .iter()
1523            .map(|c| (c.name.clone(), c.byte_offset))
1524            .collect();
1525        for (name, byte_offset) in calls_to_propagate {
1526            // Skip self-recursive edges.
1527            if defs[enc_idx].name == name {
1528                continue;
1529            }
1530            // Skip duplicates.
1531            if defs[enc_idx]
1532                .calls
1533                .iter()
1534                .any(|c| c.byte_offset == byte_offset && c.name == name)
1535            {
1536                continue;
1537            }
1538            defs[enc_idx].calls.push(CallRef {
1539                name,
1540                qualified_path: None,
1541                receiver_type: None,
1542                byte_offset,
1543                resolved: None,
1544            });
1545        }
1546    }
1547}
1548
1549// HCL: post-pass call-edge extraction for terraform_remote_state and module
1550// blocks. These are not function calls — they are HCL-specific structural
1551// references to other Terraform modules — so the generic
1552// `(function_call (identifier) @callee) @call` pattern in
1553// `call_query_for_extension("tf")` cannot capture them. This helper runs
1554// once per HCL file inside `extract_calls` (R2 + R3, Wave 3).
1555
1556/// Walk an HCL parse tree and emit CallRef entries for:
1557///
1558/// 1. `data.terraform_remote_state.<NAME>.outputs.<ATTR>` expressions:
1559///    one CallRef per reference, with `name = NAME` and
1560///    `qualified_path = Some("terraform_remote_state.NAME")`. These
1561///    connect the current file to the named remote-state module's outputs.
1562///
1563/// 2. `module "X" { source = "../X" }` blocks: one CallRef with
1564///    `name = X` (the label) and `qualified_path = Some("module.X")`.
1565///    The module reference connects to the module's directory in
1566///    `resolve_import` (HCL module-source resolution is not implemented
1567///    yet — the qualified_path carrier is the contract; resolve adds the
1568///    file lookup).
1569///
1570/// Each emitted CallRef is attached to the smallest enclosing definition
1571/// by byte range — matching the same heuristic used by `extract_calls`.
1572fn extract_hcl_call_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1573    // Walk all named descendants iteratively.
1574    let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1575    while let Some(node) = stack.pop() {
1576        // Defer to a function-style helper per node kind.
1577        match node.kind() {
1578            "expression" => hcl_visit_expression(source, node, defs),
1579            "block" => hcl_visit_block(source, node, defs),
1580            _ => {}
1581        }
1582        // Recurse into named children.
1583        let mut cursor = node.walk();
1584        for child in node.children(&mut cursor) {
1585            if child.is_named() {
1586                stack.push(child);
1587            }
1588        }
1589    }
1590}
1591
1592/// Inspect an HCL `expression` node for the
1593/// `data.terraform_remote_state.<NAME>.outputs.<ATTR>` reference pattern.
1594///
1595/// The expression tree looks like:
1596/// ```text
1597/// expression
1598///   variable_expr
1599///     identifier "data"
1600///   get_attr
1601///     identifier "terraform_remote_state"
1602///   get_attr
1603///     identifier "<NAME>"
1604///   get_attr
1605///     identifier "outputs"
1606///   get_attr
1607///     identifier "<ATTR>"
1608/// ```
1609fn hcl_visit_expression(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1610    // Collect children: must be `variable_expr` (with identifier="data")
1611    // followed by a chain of `get_attr` nodes (each with an `identifier` child).
1612    let mut cursor = node.walk();
1613    let mut child_iter = node.children(&mut cursor);
1614    let Some(first) = child_iter.next() else {
1615        return;
1616    };
1617    if first.kind() != "variable_expr" {
1618        return;
1619    }
1620    let Some(first_id) = first.child_by_field_name("name").or_else(|| {
1621        // Fallback: find first named child that's an identifier.
1622        let mut c = first.walk();
1623        first.children(&mut c).find(|n| n.kind() == "identifier")
1624    }) else {
1625        return;
1626    };
1627    if &source[first_id.start_byte()..first_id.end_byte()] != "data" {
1628        return;
1629    }
1630
1631    // Collect identifiers from the chain of get_attr.
1632    let mut chain: Vec<String> = Vec::new();
1633    for child in child_iter {
1634        if child.kind() != "get_attr" {
1635            return; // not a pure attribute chain
1636        }
1637        let mut gc = child.walk();
1638        let id = child.children(&mut gc).find(|n| n.kind() == "identifier");
1639        let Some(id_node) = id else { return };
1640        chain.push(source[id_node.start_byte()..id_node.end_byte()].to_string());
1641    }
1642
1643    // Expect: terraform_remote_state, <NAME>, outputs, <ATTR>
1644    if chain.len() < 2 || chain[0] != "terraform_remote_state" {
1645        return;
1646    }
1647    let name = chain[1].clone();
1648    let qualified_path = format!("terraform_remote_state.{name}");
1649
1650    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1651    let call_byte = node.start_byte() as u32;
1652    attach_hcl_call(defs, call_byte, name.clone(), Some(qualified_path));
1653
1654    // I#54a — additionally emit an `outputs.<ATTR>` edge when the chain
1655    // reaches `outputs.<ATTR>`. This is the dependency edge a non-trivial
1656    // IaC repo actually has: the consumer references a *specific* upstream
1657    // output by name (`outputs.bar`), and the upstream tfstate file
1658    // contains an `output "bar" { ... }` block whose def name is exactly
1659    // `bar`. Emitting this second edge — with `name = ATTR` and a
1660    // qualified path that records the full chain — lets `resolve_calls`
1661    // bind the consumer to the upstream file regardless of path layout
1662    // (Aurora-style `infrastructure/shared/main.tf` and the simpler
1663    // `infra/foo.tf` shape both work).
1664    //
1665    // The legacy label edge above is preserved so the path-segment
1666    // resolution branch in `resolve_calls_inner` keeps firing on
1667    // existing corpora.
1668    if chain.len() >= 4 && chain[2] == "outputs" {
1669        let attr = chain[3].clone();
1670        let attr_qpath = format!("terraform_remote_state.{name}.outputs.{attr}");
1671        attach_hcl_call(defs, call_byte, attr, Some(attr_qpath));
1672    }
1673}
1674
1675/// Inspect an HCL `block` node for the `module "X" { source = "../X" }`
1676/// pattern. Emits one CallRef per matching block.
1677fn hcl_visit_block(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1678    // First child must be identifier="module".
1679    let mut cursor = node.walk();
1680    let children: Vec<tree_sitter::Node<'_>> = node.children(&mut cursor).collect();
1681    let Some(first) = children.first() else {
1682        return;
1683    };
1684    if first.kind() != "identifier" || &source[first.start_byte()..first.end_byte()] != "module" {
1685        return;
1686    }
1687    // Next child should be a string_lit (the module label).
1688    let label_node = children.iter().find(|c| c.kind() == "string_lit");
1689    let Some(label_node) = label_node else {
1690        return;
1691    };
1692    let mut lc = label_node.walk();
1693    let template = label_node
1694        .children(&mut lc)
1695        .find(|n| n.kind() == "template_literal");
1696    let Some(template) = template else {
1697        return;
1698    };
1699    let label = source[template.start_byte()..template.end_byte()].to_string();
1700    let qualified_path = format!("module.{label}");
1701
1702    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1703    let call_byte = node.start_byte() as u32;
1704    attach_hcl_call(defs, call_byte, label, Some(qualified_path));
1705}
1706
1707/// Attach a synthesized HCL CallRef to the smallest enclosing definition.
1708/// Mirrors the byte-range attribution from `extract_calls`.
1709fn attach_hcl_call(
1710    defs: &mut [Definition],
1711    call_byte: u32,
1712    name: String,
1713    qualified_path: Option<String>,
1714) {
1715    let enclosing_idx = defs
1716        .iter()
1717        .enumerate()
1718        .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
1719        .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
1720        .map(|(i, _)| i);
1721    if let Some(idx) = enclosing_idx {
1722        // Skip self-recursive emission (would happen if the enclosing def
1723        // happens to share the same `name` as the synthesized callee).
1724        if defs[idx].name != name {
1725            defs[idx].calls.push(CallRef {
1726                name,
1727                qualified_path,
1728                receiver_type: None,
1729                byte_offset: call_byte,
1730                resolved: None,
1731            });
1732        }
1733    }
1734}
1735
1736/// Walk a C / C++ parse tree and emit synthetic CallRef entries for every
1737/// function identifier appearing inside a struct or array initializer
1738/// literal (I#55, 4.1.5).
1739///
1740/// Two syntactic forms produce edges:
1741///
1742/// 1. **Designated initializer** — `(initializer_pair value: (identifier))`
1743///    — corresponds to `.field = funcname` syntax (Linux `file_operations`,
1744///    ALSA `snd_pcm_ops`, etc):
1745///    ```c
1746///    static const struct file_operations my_fops = {
1747///        .read = my_read,    //  ← edge: my_fops → my_read
1748///        .write = my_write,  //  ← edge: my_fops → my_write
1749///    };
1750///    ```
1751///
1752/// 2. **Positional initializer** — `(initializer_list (identifier))` —
1753///    corresponds to bare identifier slots inside `{ ... }` (redis
1754///    `redisCommandTable`, libuv handle vtables):
1755///    ```c
1756///    struct redisCommand cmds[] = {
1757///        {"get", getCommand, 2},   //  ← edge: cmds → getCommand
1758///        {"set", setCommand, -3},  //  ← edge: cmds → setCommand
1759///    };
1760///    ```
1761///
1762/// Each emitted CallRef is attached to the smallest enclosing definition by
1763/// byte range (same heuristic as `extract_calls` / `attach_hcl_call`). For
1764/// the typical use case the enclosing def is the array/struct declaration
1765/// itself (e.g. `my_fops` or `cmds`). When the enclosing def is itself the
1766/// referenced function (e.g. a struct field designator inside a function
1767/// body) the self-recursive edge is suppressed.
1768///
1769/// Non-identifier initializer values (string literals, integers, nested
1770/// braces) are skipped by tree-sitter's `kind() == "identifier"` filter, so
1771/// no false-positive edges to undefined symbols are emitted. The resolver
1772/// (`resolve_calls`) then either binds the identifier to a real function
1773/// def (preserved as a real edge) or leaves `resolved = None` (dropped at
1774/// edge-construction time, matching how all unresolved CallRefs behave).
1775fn extract_c_struct_init_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1776    // Iterative DFS to avoid stack-blowup on deeply-nested initializer
1777    // tables (some kernel drivers nest 4–5 levels of struct-of-struct).
1778    let mut stack: Vec<tree_sitter::Node<'_>> = vec![root];
1779    while let Some(node) = stack.pop() {
1780        match node.kind() {
1781            // Designated initializer: `.field = funcname`. The grammar
1782            // exposes the rhs as a positional child, but using the `value:`
1783            // field selector picks it unambiguously when present and falls
1784            // back to the last named child otherwise. We only consume bare
1785            // identifier values — string_literal, number_literal, and
1786            // nested initializer_list are dispatched separately (the
1787            // recursion below re-enters nested initializer_list nodes).
1788            "initializer_pair" => {
1789                let value_node = node.child_by_field_name("value").or_else(|| {
1790                    // Fallback: last named child that isn't a designator.
1791                    let mut c = node.walk();
1792                    node.children(&mut c)
1793                        .filter(|n| n.is_named() && n.kind() != "field_designator")
1794                        .last()
1795                });
1796                if let Some(v) = value_node
1797                    && v.kind() == "identifier"
1798                {
1799                    emit_c_init_edge(source, v, defs);
1800                }
1801            }
1802            // Positional initializer: bare identifier directly inside an
1803            // `initializer_list`. We do NOT recurse from here to pick up
1804            // identifiers — the outer DFS already visits every node, and
1805            // emitting on direct identifier children of `initializer_list`
1806            // covers `{"name", funcname, 2}` patterns. Nested
1807            // `initializer_list` children (e.g. `{{...},{...}}`) are
1808            // popped onto the stack and processed in their own iteration.
1809            "initializer_list" => {
1810                let mut c = node.walk();
1811                for child in node.children(&mut c) {
1812                    if child.kind() == "identifier" {
1813                        emit_c_init_edge(source, child, defs);
1814                    }
1815                }
1816            }
1817            _ => {}
1818        }
1819        // Recurse into named children. The DFS visits the entire subtree
1820        // so nested `initializer_pair` / `initializer_list` nodes are
1821        // reached without special handling.
1822        let mut cursor = node.walk();
1823        for child in node.children(&mut cursor) {
1824            if child.is_named() {
1825                stack.push(child);
1826            }
1827        }
1828    }
1829}
1830
1831/// Emit one synthetic CallRef edge for a C/C++ struct-literal fnptr
1832/// reference. `ident_node` must be an `identifier` node; its text becomes
1833/// the callee name. The edge is attached to the smallest enclosing
1834/// definition (typically the surrounding `declaration` def for the table
1835/// variable itself).
1836fn emit_c_init_edge(source: &str, ident_node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
1837    let name = source[ident_node.start_byte()..ident_node.end_byte()].to_string();
1838    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
1839    let call_byte = ident_node.start_byte() as u32;
1840
1841    let enclosing_idx = defs
1842        .iter()
1843        .enumerate()
1844        .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
1845        .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
1846        .map(|(i, _)| i);
1847
1848    let Some(idx) = enclosing_idx else {
1849        return;
1850    };
1851    // Skip self-recursive emission (would happen if the enclosing def is
1852    // itself the referenced function — e.g. a static initializer inside a
1853    // function body whose .field designates the same function).
1854    if defs[idx].name == name {
1855        return;
1856    }
1857    // Skip duplicate edges (same caller, same name, same byte offset) —
1858    // belt-and-braces against the DFS visiting an identifier through both
1859    // its parent initializer_pair and a containing initializer_list scan.
1860    if defs[idx]
1861        .calls
1862        .iter()
1863        .any(|c| c.byte_offset == call_byte && c.name == name)
1864    {
1865        return;
1866    }
1867    defs[idx].calls.push(CallRef {
1868        name,
1869        qualified_path: None,
1870        receiver_type: None,
1871        byte_offset: call_byte,
1872        resolved: None,
1873    });
1874}
1875
1876/// Infer method-call receiver types from local context within a parse tree.
1877///
1878/// Returns a map from `byte_offset_of_@callee_capture` to a receiver type string.
1879///
1880/// Dispatches to a language-specific collector:
1881///
1882/// - **Rust**: [`collect_rust_receiver_types`] — three heuristic cases:
1883///   1. `self.method()` inside `impl Foo { … }` → `"Foo"`.
1884///   2. `x.method()` where `x: Bar` is a function parameter → `"Bar"`.
1885///   3. `x.method()` after `let x = Foo::new()` → `"Foo"`.
1886///
1887/// - **Python**: [`collect_python_receiver_types`] — two heuristic cases:
1888///   1. `self.method()` inside a class method → class name from enclosing
1889///      `class_definition`.
1890///   2. `instance.method()` where `instance: ClassName` type annotation or
1891///      `instance = ClassName(...)` assignment is visible in the same scope.
1892///
1893/// - **Go**: [`collect_go_receiver_types`] — one heuristic case:
1894///   1. `recv.Method()` inside a `method_declaration` where `recv` is the
1895///      named receiver parameter → receiver type from the method signature.
1896///
1897/// This is heuristic, not type-inference-complete. Unknown/ambiguous cases
1898/// produce no entry in the map; `extract_calls` leaves those `receiver_type = None`.
1899fn infer_receiver_types(
1900    source: &str,
1901    tree: &tree_sitter::Tree,
1902    language: &tree_sitter::Language,
1903) -> HashMap<u32, String> {
1904    let mut map: HashMap<u32, String> = HashMap::new();
1905
1906    if languages::is_rust_language(language) {
1907        collect_rust_receiver_types(source, tree.root_node(), &mut map);
1908    } else if languages::is_python_language(language) {
1909        collect_python_receiver_types(source, tree.root_node(), &mut map);
1910    } else if languages::is_go_language(language) {
1911        collect_go_receiver_types(source, tree.root_node(), &mut map);
1912    }
1913    // Other languages: no receiver inference — leave map empty.
1914
1915    map
1916}
1917
1918/// Walk the Rust parse tree and fill `map` with receiver-type inference.
1919///
1920/// This is a recursive descent that tracks:
1921/// - The current `impl Foo` or `impl Foo for Bar` type name (for `self.*` calls).
1922/// - Parameter type annotations (for `x: SomeType` → `x` has type `SomeType`).
1923/// - Constructor let-bindings (`let x = Foo::new()` → `x` has type `Foo`).
1924fn collect_rust_receiver_types(
1925    source: &str,
1926    node: tree_sitter::Node<'_>,
1927    map: &mut HashMap<u32, String>,
1928) {
1929    // We use a stack-based walk to avoid deep recursion on large files.
1930    // Each stack entry carries (node, impl_type_context).
1931    let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(node, None)];
1932
1933    while let Some((n, impl_ctx)) = stack.pop() {
1934        match n.kind() {
1935            "impl_item" => {
1936                // Extract `impl Foo` or `impl Trait for Foo` → capture the `for` type.
1937                // tree-sitter-rust shape: `(impl_item type: (type_identifier) @type)` for
1938                // inherent impls, and `(impl_item trait: … type: (type_identifier) @type)`
1939                // for trait impls. Both have a child named `type`.
1940                let impl_type = extract_impl_self_type(source, n);
1941                let new_ctx = impl_type.or_else(|| impl_ctx.clone());
1942                let mut cursor = n.walk();
1943                for child in n.children(&mut cursor) {
1944                    stack.push((child, new_ctx.clone()));
1945                }
1946            }
1947            "function_item" => {
1948                // Build parameter bindings: (param_name → type_name).
1949                let param_types = extract_param_types(source, n);
1950                // Build let-binding type map from constructor calls.
1951                let let_types = extract_let_binding_types(source, n);
1952                // Annotate call sites within this function body.
1953                annotate_method_calls(
1954                    source,
1955                    n,
1956                    impl_ctx.as_deref(),
1957                    &param_types,
1958                    &let_types,
1959                    map,
1960                );
1961                // Do NOT recurse into function_item children with the outer stack —
1962                // function bodies are fully handled by annotate_method_calls.
1963                // (Nested fn items would re-enter via their own impl_item context.)
1964                // Push children with same impl_ctx so nested impl blocks are found.
1965                let mut cursor = n.walk();
1966                for child in n.children(&mut cursor) {
1967                    stack.push((child, impl_ctx.clone()));
1968                }
1969            }
1970            _ => {
1971                let mut cursor = n.walk();
1972                for child in n.children(&mut cursor) {
1973                    stack.push((child, impl_ctx.clone()));
1974                }
1975            }
1976        }
1977    }
1978}
1979
1980/// Extract the self type from an `impl_item` node.
1981///
1982/// For `impl Foo { … }` → `Some("Foo")`.
1983/// For `impl Trait for Foo { … }` → `Some("Foo")` (the concrete `for` type).
1984fn extract_impl_self_type(source: &str, impl_node: tree_sitter::Node<'_>) -> Option<String> {
1985    // tree-sitter-rust: impl_item has a field named "type" for the self type.
1986    // For `impl Foo for Bar { }`, "type" is Bar; for `impl Foo { }`, "type" is Foo.
1987    let type_node = impl_node.child_by_field_name("type")?;
1988    Some(source[type_node.start_byte()..type_node.end_byte()].to_string())
1989}
1990
1991/// Extract parameter name → type mappings from a function signature.
1992///
1993/// Handles `fn foo(x: Bar, y: Baz)` → `{"x": "Bar", "y": "Baz"}`.
1994/// The `self`/`&self`/`&mut self` parameter is skipped (handled via impl_ctx).
1995fn extract_param_types(source: &str, fn_node: tree_sitter::Node<'_>) -> HashMap<String, String> {
1996    let mut params: HashMap<String, String> = HashMap::new();
1997    let Some(params_node) = fn_node.child_by_field_name("parameters") else {
1998        return params;
1999    };
2000    let mut cursor = params_node.walk();
2001    for param in params_node.children(&mut cursor) {
2002        if param.kind() == "parameter" {
2003            // parameter has children: pattern (identifier) and type
2004            let mut param_name = None;
2005            let mut param_type = None;
2006            let mut pc = param.walk();
2007            for child in param.children(&mut pc) {
2008                match child.kind() {
2009                    "identifier" | "mutable_specifier" if param_name.is_none() => {
2010                        let text = source[child.start_byte()..child.end_byte()].to_string();
2011                        if text != "mut" {
2012                            param_name = Some(text);
2013                        }
2014                    }
2015                    "type_identifier"
2016                    | "generic_type"
2017                    | "reference_type"
2018                    | "scoped_type_identifier"
2019                        if param_type.is_none() =>
2020                    {
2021                        // Extract the base type identifier from potentially complex types.
2022                        param_type = Some(extract_base_type(source, child));
2023                    }
2024                    _ => {}
2025                }
2026            }
2027            if let (Some(name), Some(ty)) = (param_name, param_type)
2028                && !ty.is_empty()
2029            {
2030                params.insert(name, ty);
2031            }
2032        }
2033        // Also handle typed_pattern in newer grammars
2034        if param.kind() == "typed_pattern" {
2035            let mut name_part = None;
2036            let mut type_part = None;
2037            let mut pc = param.walk();
2038            for child in param.children(&mut pc) {
2039                if child.kind() == "identifier" && name_part.is_none() {
2040                    name_part = Some(source[child.start_byte()..child.end_byte()].to_string());
2041                } else if matches!(
2042                    child.kind(),
2043                    "type_identifier"
2044                        | "generic_type"
2045                        | "reference_type"
2046                        | "scoped_type_identifier"
2047                ) && type_part.is_none()
2048                {
2049                    type_part = Some(extract_base_type(source, child));
2050                }
2051            }
2052            if let (Some(name), Some(ty)) = (name_part, type_part)
2053                && !ty.is_empty()
2054            {
2055                params.insert(name, ty);
2056            }
2057        }
2058    }
2059    params
2060}
2061
2062/// Extract the base `TypeIdentifier` from a potentially complex type node.
2063///
2064/// For `Bar`, `&Bar`, `&mut Bar`, `Bar<T>` → returns `"Bar"`.
2065/// For `module::Bar` → returns `"Bar"` (bare name for matching).
2066fn extract_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2067    match node.kind() {
2068        "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2069        "generic_type" | "reference_type" | "mutable_specifier" | "scoped_type_identifier" => {
2070            // Recurse to find the innermost type_identifier
2071            let mut cursor = node.walk();
2072            for child in node.children(&mut cursor) {
2073                let t = extract_base_type(source, child);
2074                if !t.is_empty() {
2075                    return t;
2076                }
2077            }
2078            String::new()
2079        }
2080        _ => {
2081            // For other nodes, try children
2082            let mut cursor = node.walk();
2083            for child in node.children(&mut cursor) {
2084                if child.kind() == "type_identifier" {
2085                    return source[child.start_byte()..child.end_byte()].to_string();
2086                }
2087            }
2088            String::new()
2089        }
2090    }
2091}
2092
2093/// Scan a function body for `let x = Foo::new()` patterns.
2094///
2095/// Returns a map from local variable name to the constructor type name.
2096/// E.g., `let x = Foo::new();` → `{"x": "Foo"}`.
2097fn extract_let_binding_types(
2098    source: &str,
2099    fn_node: tree_sitter::Node<'_>,
2100) -> HashMap<String, String> {
2101    let mut bindings: HashMap<String, String> = HashMap::new();
2102
2103    let Some(body) = fn_node.child_by_field_name("body") else {
2104        return bindings;
2105    };
2106
2107    // Walk the function body looking for let_declaration nodes.
2108    let mut stack = vec![body];
2109    while let Some(n) = stack.pop() {
2110        if n.kind() == "let_declaration" {
2111            // let_declaration: pattern (identifier) + value (call_expression or …)
2112            let mut binding_name = None;
2113            let mut constructor_type = None;
2114            let mut cursor = n.walk();
2115            for child in n.children(&mut cursor) {
2116                match child.kind() {
2117                    "identifier" if binding_name.is_none() => {
2118                        binding_name =
2119                            Some(source[child.start_byte()..child.end_byte()].to_string());
2120                    }
2121                    "call_expression" => {
2122                        // Look for `Foo::new()` or `Foo::from(…)` patterns.
2123                        // The function child of call_expression is a scoped_identifier.
2124                        if let Some(func) = child.child_by_field_name("function")
2125                            && func.kind() == "scoped_identifier"
2126                        {
2127                            // scoped_identifier path: `Foo::new` — extract head segment.
2128                            let full = source[func.start_byte()..func.end_byte()].to_string();
2129                            let head = full.split("::").next().unwrap_or("").to_string();
2130                            if !head.is_empty()
2131                                && head.chars().next().is_some_and(char::is_uppercase)
2132                            {
2133                                constructor_type = Some(head);
2134                            }
2135                        }
2136                    }
2137                    _ => {}
2138                }
2139            }
2140            if let (Some(name), Some(ty)) = (binding_name, constructor_type) {
2141                bindings.insert(name, ty);
2142            }
2143        }
2144        // Push children for recursive walk.
2145        let mut cursor = n.walk();
2146        for child in n.children(&mut cursor) {
2147            stack.push(child);
2148        }
2149    }
2150
2151    bindings
2152}
2153
2154/// Walk a function body and annotate method-call byte offsets with receiver types.
2155///
2156/// A "method call" in the ripvec call query is:
2157/// `(call_expression function: (field_expression field: (field_identifier) @callee))`
2158///
2159/// The receiver is the `value` child of `field_expression`. This function
2160/// checks whether the receiver is:
2161/// - `self` → use `impl_ctx` type.
2162/// - An identifier matching a parameter type in `param_types`.
2163/// - An identifier matching a constructor let-binding in `let_types`.
2164fn annotate_method_calls(
2165    source: &str,
2166    fn_node: tree_sitter::Node<'_>,
2167    impl_ctx: Option<&str>,
2168    param_types: &HashMap<String, String>,
2169    let_types: &HashMap<String, String>,
2170    map: &mut HashMap<u32, String>,
2171) {
2172    // Walk the entire function (including its body) looking for call_expression nodes.
2173    let mut stack = vec![fn_node];
2174    while let Some(n) = stack.pop() {
2175        if n.kind() == "call_expression"
2176            && let Some(func) = n.child_by_field_name("function")
2177            && func.kind() == "field_expression"
2178        {
2179            // field_expression: value (receiver) + field (method name identifier)
2180            if let (Some(recv), Some(field)) = (
2181                func.child_by_field_name("value"),
2182                func.child_by_field_name("field"),
2183            ) {
2184                let recv_text = source[recv.start_byte()..recv.end_byte()].to_string();
2185                let receiver_type = if recv_text == "self" || recv_text == "*self" {
2186                    impl_ctx.map(str::to_owned)
2187                } else {
2188                    // Strip ref sigils for lookup.
2189                    let base = recv_text
2190                        .trim_start_matches('*')
2191                        .trim_start_matches('&')
2192                        .trim();
2193                    param_types
2194                        .get(base)
2195                        .or_else(|| let_types.get(base))
2196                        .cloned()
2197                };
2198
2199                if let Some(ty) = receiver_type {
2200                    // The `@callee` capture byte offset is the start of the field node.
2201                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2202                    let field_byte = field.start_byte() as u32;
2203                    map.insert(field_byte, ty);
2204                }
2205            }
2206        }
2207        let mut cursor = n.walk();
2208        for child in n.children(&mut cursor) {
2209            stack.push(child);
2210        }
2211    }
2212}
2213
2214// ── Python receiver-type heuristic ───────────────────────────────────
2215
2216/// Walk the Python parse tree and fill `map` with receiver-type inference.
2217///
2218/// Two heuristic cases:
2219///
2220/// 1. **`self.method()` inside a method** — when the `attribute` call receiver is
2221///    the literal text `self`, the receiver type is the name of the nearest
2222///    enclosing `class_definition`.
2223///
2224/// 2. **`instance.method()` with a type annotation or constructor call** —
2225///    when a function parameter has a PEP 484 annotation `param: ClassName` or
2226///    when a local assignment `param = ClassName(...)` precedes the call, the
2227///    receiver type is bound to `ClassName`.
2228///
2229/// The Python call query captures:
2230/// - `(call function: (attribute attribute: (identifier) @callee)) @call`
2231///
2232/// Within the `attribute` node, `value` is the receiver expression and
2233/// `attribute` is the method name (the `@callee` capture). The `@callee`
2234/// byte offset is the start of the `attribute` child identifier node.
2235fn collect_python_receiver_types(
2236    source: &str,
2237    root: tree_sitter::Node<'_>,
2238    map: &mut HashMap<u32, String>,
2239) {
2240    // Stack carries (node, class_ctx: Option<String>).
2241    // class_ctx is the name of the nearest enclosing class_definition.
2242    let mut stack: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(root, None)];
2243
2244    while let Some((n, class_ctx)) = stack.pop() {
2245        match n.kind() {
2246            "class_definition" => {
2247                // Extract the class name from the `name` child.
2248                let class_name = n
2249                    .child_by_field_name("name")
2250                    .map(|c| source[c.start_byte()..c.end_byte()].to_string());
2251                let new_ctx = class_name.or_else(|| class_ctx.clone());
2252                let mut cursor = n.walk();
2253                for child in n.children(&mut cursor) {
2254                    stack.push((child, new_ctx.clone()));
2255                }
2256            }
2257            "function_definition" => {
2258                // Build parameter annotation map: param_name → type_name.
2259                let param_types = extract_python_param_types(source, n);
2260                // Build local assignment map: var_name → constructor type.
2261                let let_types = extract_python_assignment_types(source, n);
2262                // Annotate attribute call sites within this function body.
2263                annotate_python_method_calls(
2264                    source,
2265                    n,
2266                    class_ctx.as_deref(),
2267                    &param_types,
2268                    &let_types,
2269                    map,
2270                );
2271                // Push children with same class_ctx so nested classes are found.
2272                let mut cursor = n.walk();
2273                for child in n.children(&mut cursor) {
2274                    stack.push((child, class_ctx.clone()));
2275                }
2276            }
2277            _ => {
2278                let mut cursor = n.walk();
2279                for child in n.children(&mut cursor) {
2280                    stack.push((child, class_ctx.clone()));
2281                }
2282            }
2283        }
2284    }
2285}
2286
2287/// Extract Python parameter name → type annotation mappings.
2288///
2289/// Handles PEP 484 style: `def foo(self, x: Bar, y: Baz) -> ...`.
2290/// The `self` parameter is excluded (handled via class_ctx).
2291/// Returns `{"x": "Bar", "y": "Baz"}`.
2292fn extract_python_param_types(
2293    source: &str,
2294    fn_node: tree_sitter::Node<'_>,
2295) -> HashMap<String, String> {
2296    let mut params: HashMap<String, String> = HashMap::new();
2297    let Some(params_node) = fn_node.child_by_field_name("parameters") else {
2298        return params;
2299    };
2300
2301    // Parameters node children include `identifier`, `typed_parameter`,
2302    // `typed_default_parameter`, and others.
2303    let mut cursor = params_node.walk();
2304    for param in params_node.children(&mut cursor) {
2305        match param.kind() {
2306            "typed_parameter" => {
2307                // (typed_parameter (identifier) @name type: (type) @type)
2308                // First identifier child is the name; type child is the type.
2309                let mut name_text = None;
2310                let mut type_text = None;
2311                let mut pc = param.walk();
2312                for child in param.children(&mut pc) {
2313                    match child.kind() {
2314                        "identifier" if name_text.is_none() => {
2315                            let t = source[child.start_byte()..child.end_byte()].to_string();
2316                            if t != "self" && t != "cls" {
2317                                name_text = Some(t);
2318                            }
2319                        }
2320                        "type" | "identifier" | "attribute"
2321                            if type_text.is_none() && name_text.is_some() =>
2322                        {
2323                            // The type child in tree-sitter-python is a `type` node
2324                            // whose text is the annotation expression. Extract the
2325                            // base identifier (handle `Optional[Bar]`, `List[Bar]`, etc.)
2326                            type_text = Some(extract_python_base_type(source, child));
2327                        }
2328                        _ => {}
2329                    }
2330                }
2331                if let (Some(name), Some(ty)) = (name_text, type_text)
2332                    && !ty.is_empty()
2333                    && !ty.eq("self")
2334                    && !ty.eq("cls")
2335                {
2336                    params.insert(name, ty);
2337                }
2338            }
2339            "typed_default_parameter" => {
2340                // (typed_default_parameter name: (identifier) type: (type) value: …)
2341                let name_node = param.child_by_field_name("name");
2342                let type_node = param.child_by_field_name("type");
2343                if let (Some(nn), Some(tn)) = (name_node, type_node) {
2344                    let name = source[nn.start_byte()..nn.end_byte()].to_string();
2345                    if name != "self" && name != "cls" {
2346                        let ty = extract_python_base_type(source, tn);
2347                        if !ty.is_empty() {
2348                            params.insert(name, ty);
2349                        }
2350                    }
2351                }
2352            }
2353            _ => {}
2354        }
2355    }
2356    params
2357}
2358
2359/// Extract the base type name from a Python type annotation node.
2360///
2361/// For `Bar` → `"Bar"`. For `Optional[Bar]` or `List[Bar]` → `"Bar"`.
2362/// For `module.Class` → `"Class"` (bare name only).
2363fn extract_python_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2364    match node.kind() {
2365        "identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2366        // tree-sitter-python wraps annotations in a `type` node
2367        "type" => {
2368            let mut cursor = node.walk();
2369            for child in node.children(&mut cursor) {
2370                let t = extract_python_base_type(source, child);
2371                if !t.is_empty() {
2372                    return t;
2373                }
2374            }
2375            String::new()
2376        }
2377        // Generic alias: `Optional[Bar]` — the first identifier child is `Optional`,
2378        // the subscript child contains `Bar`. We want the subscript content.
2379        "subscript" => {
2380            // subscript has value (e.g. Optional) and subscript (e.g. Bar).
2381            // Return the subscript's base type (the inner type argument).
2382            if let Some(sub) = node.child_by_field_name("subscript") {
2383                return extract_python_base_type(source, sub);
2384            }
2385            // Fall back: first identifier
2386            let mut cursor = node.walk();
2387            for child in node.children(&mut cursor) {
2388                if child.kind() == "identifier" {
2389                    return source[child.start_byte()..child.end_byte()].to_string();
2390                }
2391            }
2392            String::new()
2393        }
2394        // Attribute node `module.Class` → take last identifier
2395        "attribute" => {
2396            if let Some(attr) = node.child_by_field_name("attribute") {
2397                return source[attr.start_byte()..attr.end_byte()].to_string();
2398            }
2399            String::new()
2400        }
2401        _ => {
2402            // Try first identifier child
2403            let mut cursor = node.walk();
2404            for child in node.children(&mut cursor) {
2405                if child.kind() == "identifier" {
2406                    return source[child.start_byte()..child.end_byte()].to_string();
2407                }
2408            }
2409            String::new()
2410        }
2411    }
2412}
2413
2414/// Scan a Python function body for `x = ClassName(...)` assignment patterns.
2415///
2416/// Returns a map from local variable name to constructor type.
2417/// E.g., `x = Foo()` → `{"x": "Foo"}`.
2418/// Also handles `x = module.ClassName(...)` → `{"x": "ClassName"}`.
2419fn extract_python_assignment_types(
2420    source: &str,
2421    fn_node: tree_sitter::Node<'_>,
2422) -> HashMap<String, String> {
2423    let mut bindings: HashMap<String, String> = HashMap::new();
2424    let Some(body) = fn_node.child_by_field_name("body") else {
2425        return bindings;
2426    };
2427
2428    let mut stack = vec![body];
2429    while let Some(n) = stack.pop() {
2430        if n.kind() == "assignment" {
2431            // assignment: left = right
2432            // We want: left is a simple identifier, right is a call whose
2433            // function is an identifier starting with an uppercase letter
2434            // (Python convention for class names).
2435            let left = n.child_by_field_name("left");
2436            let right = n.child_by_field_name("right");
2437            if let (Some(lhs), Some(rhs)) = (left, right)
2438                && lhs.kind() == "identifier"
2439                && rhs.kind() == "call"
2440                && let Some(func) = rhs.child_by_field_name("function")
2441            {
2442                let var_name = source[lhs.start_byte()..lhs.end_byte()].to_string();
2443                let constructor_type = match func.kind() {
2444                    "identifier" => {
2445                        let t = source[func.start_byte()..func.end_byte()].to_string();
2446                        // Class names are conventionally uppercase-first
2447                        if t.chars().next().is_some_and(char::is_uppercase) {
2448                            Some(t)
2449                        } else {
2450                            None
2451                        }
2452                    }
2453                    "attribute" => {
2454                        // `module.ClassName(...)` — take the `attribute` part
2455                        func.child_by_field_name("attribute")
2456                            .map(|a| source[a.start_byte()..a.end_byte()].to_string())
2457                    }
2458                    _ => None,
2459                };
2460                if let Some(ty) = constructor_type {
2461                    bindings.insert(var_name, ty);
2462                }
2463            }
2464        }
2465        let mut cursor = n.walk();
2466        for child in n.children(&mut cursor) {
2467            stack.push(child);
2468        }
2469    }
2470    bindings
2471}
2472
2473/// Walk a Python function body and annotate attribute-call byte offsets.
2474///
2475/// A Python method call is:
2476/// `(call function: (attribute value: <receiver> attribute: (identifier) @callee))`
2477///
2478/// The receiver is the `value` child of `attribute`. This function checks:
2479/// - `self` → use the enclosing class name (`class_ctx`).
2480/// - An identifier matching a parameter type in `param_types`.
2481/// - An identifier matching a constructor assignment in `let_types`.
2482fn annotate_python_method_calls(
2483    source: &str,
2484    fn_node: tree_sitter::Node<'_>,
2485    class_ctx: Option<&str>,
2486    param_types: &HashMap<String, String>,
2487    let_types: &HashMap<String, String>,
2488    map: &mut HashMap<u32, String>,
2489) {
2490    let mut stack = vec![fn_node];
2491    while let Some(n) = stack.pop() {
2492        if n.kind() == "call"
2493            && let Some(func) = n.child_by_field_name("function")
2494            && func.kind() == "attribute"
2495            && let (Some(recv_node), Some(attr_node)) = (
2496                func.child_by_field_name("object"),
2497                func.child_by_field_name("attribute"),
2498            )
2499        {
2500            // attribute node: object (receiver) + attribute (method name)
2501            let recv_text = source[recv_node.start_byte()..recv_node.end_byte()].to_string();
2502            let receiver_type = if recv_text == "self" || recv_text == "cls" {
2503                class_ctx.map(str::to_owned)
2504            } else if recv_node.kind() == "identifier" {
2505                param_types
2506                    .get(&recv_text)
2507                    .or_else(|| let_types.get(&recv_text))
2508                    .cloned()
2509            } else {
2510                None
2511            };
2512
2513            if let Some(ty) = receiver_type {
2514                // The `@callee` capture byte offset is the `attribute` child.
2515                #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2516                let attr_byte = attr_node.start_byte() as u32;
2517                map.insert(attr_byte, ty);
2518            }
2519        }
2520        let mut cursor = n.walk();
2521        for child in n.children(&mut cursor) {
2522            stack.push(child);
2523        }
2524    }
2525}
2526
2527// ── Python class hierarchy (MRO) extraction ───────────────────────────
2528
2529/// Walk the Python parse tree and add class → parent-names entries to `out`.
2530///
2531/// tree-sitter-python shape:
2532/// ```text
2533/// (class_definition
2534///   name: (identifier) @child
2535///   superclasses: (argument_list
2536///     (identifier) @parent             ; bare parent: class Foo(Bar):
2537///     (attribute attribute: (identifier) @parent) ; qualified: class Foo(mod.Bar):
2538///     (keyword_argument …)             ; ignored: class Foo(Bar, metaclass=Meta):
2539///   )?
2540///   ...)
2541/// ```
2542///
2543/// Each map entry's key is a class defined in the file; the value is the
2544/// ordered list of declared parent class **names** (the trailing `attribute`
2545/// segment, so `mod.Bar` becomes `Bar`). Classes with no declared parents
2546/// still get an entry (empty `Vec`) so a downstream MRO walk can tell
2547/// "known class with no parents" from "unknown class".
2548fn extract_python_class_hierarchy_node(
2549    source: &str,
2550    root: tree_sitter::Node<'_>,
2551    out: &mut HashMap<String, Vec<String>>,
2552) {
2553    let mut stack = vec![root];
2554    while let Some(n) = stack.pop() {
2555        if n.kind() == "class_definition"
2556            && let Some(name_node) = n.child_by_field_name("name")
2557        {
2558            let class_name = source[name_node.start_byte()..name_node.end_byte()].to_string();
2559            let mut parents: Vec<String> = Vec::new();
2560            if let Some(superclasses) = n.child_by_field_name("superclasses") {
2561                // superclasses is an `argument_list`; iterate its children and
2562                // collect identifiers / attribute trailing-names. Skip
2563                // keyword_argument entries (metaclass=…, etc.) and punctuation.
2564                let mut sc = superclasses.walk();
2565                for child in superclasses.children(&mut sc) {
2566                    match child.kind() {
2567                        "identifier" => {
2568                            let t = source[child.start_byte()..child.end_byte()].to_string();
2569                            parents.push(t);
2570                        }
2571                        "attribute" => {
2572                            // module.Cls → take the trailing `attribute` segment.
2573                            if let Some(attr) = child.child_by_field_name("attribute") {
2574                                parents
2575                                    .push(source[attr.start_byte()..attr.end_byte()].to_string());
2576                            }
2577                        }
2578                        // Drop keyword_argument, "(", ")", ",", comments, etc.
2579                        _ => {}
2580                    }
2581                }
2582            }
2583            out.insert(class_name, parents);
2584        }
2585        let mut cursor = n.walk();
2586        for child in n.children(&mut cursor) {
2587            stack.push(child);
2588        }
2589    }
2590}
2591
2592/// Extract the Python `class → [parents]` map from a single source file by
2593/// parsing it with tree-sitter-python.
2594///
2595/// Returns an empty map when the source fails to parse or contains no
2596/// `class_definition` nodes. The returned map is the per-file contribution
2597/// to the global hierarchy used by [`resolve_calls_with_python_mro_pub`]
2598/// for MRO-aware receiver-type dispatch (Q1, Wave 2).
2599#[must_use]
2600pub fn extract_python_class_hierarchy(source: &str) -> HashMap<String, Vec<String>> {
2601    let mut parser = Parser::new();
2602    let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
2603    if parser.set_language(&lang).is_err() {
2604        return HashMap::new();
2605    }
2606    let Some(tree) = parser.parse(source, None) else {
2607        return HashMap::new();
2608    };
2609    let mut out: HashMap<String, Vec<String>> = HashMap::new();
2610    extract_python_class_hierarchy_node(source, tree.root_node(), &mut out);
2611    out
2612}
2613
/// Compute an approximate MRO (Method Resolution Order) for a Python class
/// name using a **simplified left-first depth-first walk** of the declared
/// `class → [parents]` hierarchy.
///
/// Python's real MRO uses C3 linearisation. For ripvec's reverse-call-graph
/// purpose we want *any plausible ancestor* of the receiver type — including
/// ancestors only reachable via a mixin — so that `self.method()` calls whose
/// dispatch lands on an ancestor can be resolved. The simplification is a
/// pre-order DFS, left-to-right, that skips already-seen names.
///
/// On a diamond the walk may surface a shared ancestor earlier than C3
/// would, but every ancestor is still reached. Example for
/// `class Sub(Base, Mixin)`:
/// `Base, <Base's ancestors>, Mixin, <Mixin's ancestors>`.
///
/// # Returns
///
/// The ancestors of `start` in visit order. The list
/// - never contains `start` itself, even when the hierarchy is cyclic or a
///   class lists itself as a parent (the map is keyed by bare names, so
///   unrelated classes sharing a name can produce such cycles);
/// - contains each name at most once;
/// - includes parent names that have no entry of their own in `hierarchy`;
/// - is empty when `start` has no entry in `hierarchy`.
fn compute_python_mro<H: std::hash::BuildHasher>(
    start: &str,
    hierarchy: &HashMap<String, Vec<String>, H>,
) -> Vec<String> {
    use std::collections::HashSet;

    // The receiver type's own scope is searched by the caller's direct match,
    // so the walk begins at the immediate parents of `start`.
    let Some(start_parents) = hierarchy.get(start) else {
        return Vec::new();
    };

    let mut order: Vec<String> = Vec::new();
    // Seeding `visited` with `start` keeps cycles from re-emitting it.
    let mut visited: HashSet<&str> = HashSet::from([start]);
    // Parents are pushed in reverse so that `pop` yields them left-first.
    let mut stack: Vec<&str> = start_parents.iter().rev().map(String::as_str).collect();

    while let Some(cls) = stack.pop() {
        if !visited.insert(cls) {
            continue;
        }
        order.push(cls.to_owned());
        if let Some(parents) = hierarchy.get(cls) {
            stack.extend(
                parents
                    .iter()
                    .rev()
                    .map(String::as_str)
                    .filter(|p| !visited.contains(p)),
            );
        }
    }
    order
}
2666
2667// ── Go receiver-type heuristic ────────────────────────────────────────
2668
2669/// Walk the Go parse tree and fill `map` with receiver-type inference.
2670///
2671/// One heuristic case: **`recv.Method()` inside a `method_declaration`**.
2672///
2673/// Go methods have an explicit receiver parameter in their signature:
2674/// `func (r *Foo) Bar() { r.Baz() }` — `r` is bound to type `Foo`.
2675///
2676/// The Go call query captures:
2677/// `(call_expression function: (selector_expression field: (field_identifier) @callee))`
2678///
2679/// Within `selector_expression`, `operand` is the receiver expression and
2680/// `field` is the method name (the `@callee` capture).
2681///
2682/// This function also handles `self.method()` patterns for cases where code
2683/// uses `self` as a receiver name (not idiomatic Go, but it occurs).
2684fn collect_go_receiver_types(
2685    source: &str,
2686    root: tree_sitter::Node<'_>,
2687    map: &mut HashMap<u32, String>,
2688) {
2689    // Stack carries (node, receiver_binding: Option<(recv_name, recv_type)>).
2690    let mut stack: Vec<(tree_sitter::Node<'_>, Option<(String, String)>)> = vec![(root, None)];
2691
2692    while let Some((n, recv_binding)) = stack.pop() {
2693        if n.kind() == "method_declaration" {
2694            // Extract the receiver name and type from the method signature.
2695            let binding = extract_go_receiver_binding(source, n);
2696            let new_binding = binding.or_else(|| recv_binding.clone());
2697            let mut cursor = n.walk();
2698            for child in n.children(&mut cursor) {
2699                stack.push((child, new_binding.clone()));
2700            }
2701        } else {
2702            // For any call_expression whose function is a selector_expression,
2703            // check if the operand matches the active receiver binding.
2704            if n.kind() == "call_expression"
2705                && let Some(func) = n.child_by_field_name("function")
2706                && func.kind() == "selector_expression"
2707                && let (Some(operand), Some(field)) = (
2708                    func.child_by_field_name("operand"),
2709                    func.child_by_field_name("field"),
2710                )
2711            {
2712                let recv_text = source[operand.start_byte()..operand.end_byte()].to_string();
2713                let receiver_type = recv_binding.as_ref().and_then(|(recv_name, recv_ty)| {
2714                    if recv_text == *recv_name {
2715                        Some(recv_ty.clone())
2716                    } else {
2717                        None
2718                    }
2719                });
2720
2721                if let Some(ty) = receiver_type {
2722                    // The `@callee` capture byte offset is the `field` child.
2723                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2724                    let field_byte = field.start_byte() as u32;
2725                    map.insert(field_byte, ty);
2726                }
2727            }
2728
2729            let mut cursor = n.walk();
2730            for child in n.children(&mut cursor) {
2731                stack.push((child, recv_binding.clone()));
2732            }
2733        }
2734    }
2735}
2736
2737/// Extract the receiver name and base type from a Go `method_declaration`.
2738///
2739/// Go method declaration shape (tree-sitter-go):
2740/// ```text
2741/// (method_declaration
2742///   receiver: (parameter_list
2743///     (parameter_declaration
2744///       name: (identifier)       ← receiver name
2745///       type: (type_identifier   ← receiver type (bare)
2746///            | pointer_type (type_identifier)) ← or *Type
2747///     )
2748///   )
2749///   name: (field_identifier)
2750///   ...
2751/// )
2752/// ```
2753///
2754/// Returns `Some((recv_name, type_name))` or `None` if the receiver is unnamed
2755/// (blank identifier `_`) or has an unrecognisable shape.
2756fn extract_go_receiver_binding(
2757    source: &str,
2758    method_node: tree_sitter::Node<'_>,
2759) -> Option<(String, String)> {
2760    let receiver_list = method_node.child_by_field_name("receiver")?;
2761    // parameter_list contains one parameter_declaration
2762    let mut cursor = receiver_list.walk();
2763    for param in receiver_list.children(&mut cursor) {
2764        if param.kind() == "parameter_declaration" {
2765            let name_node = param.child_by_field_name("name");
2766            let type_node = param.child_by_field_name("type");
2767            if let (Some(nn), Some(tn)) = (name_node, type_node) {
2768                let name = source[nn.start_byte()..nn.end_byte()].to_string();
2769                if name == "_" || name.is_empty() {
2770                    return None;
2771                }
2772                let ty = extract_go_base_type(source, tn);
2773                if !ty.is_empty() {
2774                    return Some((name, ty));
2775                }
2776            }
2777        }
2778    }
2779    None
2780}
2781
2782/// Extract the base type name from a Go type node.
2783///
2784/// For `Foo` (type_identifier) → `"Foo"`.
2785/// For `*Foo` (pointer_type → type_identifier) → `"Foo"`.
2786fn extract_go_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
2787    match node.kind() {
2788        "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
2789        "pointer_type" => {
2790            // pointer_type has one child: the pointee type
2791            let mut cursor = node.walk();
2792            for child in node.children(&mut cursor) {
2793                if child.kind() == "type_identifier" {
2794                    return source[child.start_byte()..child.end_byte()].to_string();
2795                }
2796                let t = extract_go_base_type(source, child);
2797                if !t.is_empty() {
2798                    return t;
2799                }
2800            }
2801            String::new()
2802        }
2803        _ => {
2804            let mut cursor = node.walk();
2805            for child in node.children(&mut cursor) {
2806                if child.kind() == "type_identifier" {
2807                    return source[child.start_byte()..child.end_byte()].to_string();
2808                }
2809            }
2810            String::new()
2811        }
2812    }
2813}
2814
2815/// Enrich Go `method_declaration` definition scopes with their receiver type name.
2816///
2817/// In the generic `extract_definitions` path, `build_scope_chain` walks the
2818/// *parent* chain of the `@def` node. For Go `method_declaration`, the parent
2819/// is the file root — so the scope is always `""`.
2820///
2821/// An empty scope means `resolve_calls` Priority 2 (receiver-type matching via
2822/// `scope.contains(recv_type)`) never fires for Go methods. Cross-file calls
2823/// where the caller inferred `receiver_type = Some("Foo")` stay unresolved;
2824/// no edge is recorded; `def_callers[]` stays empty for those defs — the root
2825/// cause of the missing inverse index for Go (I#P1).
2826///
2827/// Fix: after `extract_definitions`, parse the Go source a second time to find
2828/// each `method_declaration`'s receiver type, then set the matching def's scope
2829/// to `"method_declaration {ReceiverType}"`.  This matches the pattern used by
2830/// the existing `go_resolve_receiver_method_via_signature` integration test,
2831/// which asserts that `scope.contains("Foo")` succeeds when the scope is
2832/// `"method_declaration Foo"`.
2833///
2834/// Matching is by `start_byte` (precise) so name collisions across different
2835/// receiver types are handled correctly.
2836fn enrich_go_method_def_scopes(source: &str, defs: &mut [Definition]) {
2837    let go_lang: tree_sitter::Language = tree_sitter_go::LANGUAGE.into();
2838    let mut parser = Parser::new();
2839    if parser.set_language(&go_lang).is_err() {
2840        return;
2841    }
2842    let Some(tree) = parser.parse(source, None) else {
2843        return;
2844    };
2845
2846    // Walk all top-level method_declaration nodes.
2847    let root = tree.root_node();
2848    let mut method_cursor = root.walk();
2849    for child in root.children(&mut method_cursor) {
2850        if child.kind() != "method_declaration" {
2851            continue;
2852        }
2853        let Some((_, recv_type)) = extract_go_receiver_binding(source, child) else {
2854            continue;
2855        };
2856        // Match by start_byte (precise): the @def node for method_declaration in
2857        // the Go definition query is the method_declaration node itself, so its
2858        // start_byte matches the def's start_byte recorded during extract_definitions.
2859        #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2860        let method_start_byte = child.start_byte() as u32;
2861        for def in defs.iter_mut() {
2862            if def.kind == "method_declaration" && def.start_byte == method_start_byte {
2863                def.scope = format!("method_declaration {recv_type}");
2864                break;
2865            }
2866        }
2867    }
2868}
2869
/// Public wrapper for `enrich_go_method_def_scopes` — enables integration tests
/// to call it directly without going through the full `build_graph` pipeline.
///
/// `source` is the Go source text and `defs` the definitions extracted from
/// it. Both are forwarded unchanged, and matching `method_declaration` defs
/// have their `scope` rewritten in place.
pub fn enrich_go_method_def_scopes_pub(source: &str, defs: &mut [Definition]) {
    enrich_go_method_def_scopes(source, defs);
}
2875
2876/// SQL: prepend a synthetic whole-file definition whose name is the filename stem.
2877///
2878/// dbt and sqlmesh follow a filename-as-model-name convention:
2879///
2880/// - `silver_issuer_returns.sql` defines the `silver_issuer_returns` model.
2881/// - `gold_issuer_returns.sql` references the silver model by filename stem,
2882///   not by any in-source CREATE TABLE.
2883///
2884/// In sqlmesh, the in-source name is templated:
2885///
2886/// ```sql
2887/// MODEL (
2888///   name @{athena_sqlmesh_silver_schema}.issuer_returns,
2889///   ...
2890/// );
2891/// SELECT ... FROM @{athena_sqlmesh_silver_schema}.stg_issuer_returns;
2892/// ```
2893///
2894/// The `MODEL (...)` header parses as an ERROR node under tree-sitter-sequel
2895/// because `@{var}` is not standard SQL; FROM/JOIN further down the file still
2896/// extract cleanly. Without a synthetic def, `lsp_workspace_symbols(query=
2897/// "silver_issuer_returns")` returns no hits — there is no real CREATE TABLE
2898/// in the file and the model name is interpolation only.
2899///
2900/// This helper prepends a definition with:
2901/// - `name` = filename stem (e.g., `silver_issuer_returns`)
2902/// - `kind` = `"sql_file"` (maps to `LSP SymbolKind::File` in
2903///   [`languages::lsp_symbol_kind_for_node_kind`])
2904/// - byte range = the entire source `[0, source.len())`
2905/// - scope / signature / qualified_name = empty / None
2906///
2907/// The whole-file byte range is the key to FROM/JOIN attribution: when
2908/// `extract_calls` later places a CallRef from a FROM clause that is not
2909/// inside any CTE or other smaller def, the smallest-enclosing-def search
2910/// lands on this synthetic file def and the edge is recorded.
2911///
2912/// If the filename has no stem (empty / `..`), the helper is a no-op.
2913/// Idempotent: if a `sql_file` def already exists at byte 0, it is left alone.
2914pub(crate) fn enrich_sql_file_def(filename: &str, source: &str, defs: &mut Vec<Definition>) {
2915    // Idempotency: do nothing if a sql_file def is already present at byte 0.
2916    if defs
2917        .iter()
2918        .any(|d| d.kind == "sql_file" && d.start_byte == 0)
2919    {
2920        return;
2921    }
2922
2923    // Derive the filename stem (last path component, file extension stripped).
2924    let stem = std::path::Path::new(filename)
2925        .file_stem()
2926        .and_then(|s| s.to_str())
2927        .unwrap_or_default();
2928    if stem.is_empty() {
2929        return;
2930    }
2931
2932    // Count newlines so end_line is reasonable for downstream UI.
2933    let end_line_zero_based = source.bytes().filter(|&b| b == b'\n').count();
2934    #[expect(clippy::cast_possible_truncation, reason = "line counts fit in u32")]
2935    let end_line = (end_line_zero_based as u32) + 1;
2936    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
2937    let end_byte = source.len() as u32;
2938
2939    let file_def = Definition {
2940        name: stem.to_string(),
2941        kind: "sql_file".to_string(),
2942        start_line: 1,
2943        end_line,
2944        scope: String::new(),
2945        signature: None,
2946        start_byte: 0,
2947        end_byte,
2948        calls: vec![],
2949        decorator: None,
2950        lsp_kind_hint: None,
2951    };
2952    // Prepend so it remains the outermost (largest) enclosing def at byte 0,
2953    // ensuring narrow CTE defs are still preferred for inner-FROM attribution
2954    // (the smallest-enclosing rule in `extract_calls`).
2955    defs.insert(0, file_def);
2956}
2957
/// Public wrapper for [`enrich_sql_file_def`] — enables integration tests
/// to call it directly without going through the full `build_graph` pipeline.
///
/// This is `pub` (not `pub(crate)`) because integration tests in
/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
/// `pub(crate)` items.
///
/// All three arguments are forwarded unchanged: `filename` supplies the stem
/// used as the def name, `source` supplies the byte and line extent, and
/// `defs` receives the synthetic def at index 0.
pub fn enrich_sql_file_def_pub(filename: &str, source: &str, defs: &mut Vec<Definition>) {
    enrich_sql_file_def(filename, source, defs);
}
2967
/// Public wrapper for `extract_calls` — enables integration tests to call it
/// directly without going through the full `build_graph` pipeline.
///
/// This is `pub` (not `pub(crate)`) because integration tests in
/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
/// `pub(crate)` items.
///
/// Pure delegation: `source`, `call_config` and `defs` are passed through
/// unchanged, and any effect on `defs` is whatever `extract_calls` performs.
pub fn extract_calls_pub(
    source: &str,
    call_config: &languages::CallConfig,
    defs: &mut [Definition],
) {
    extract_calls(source, call_config, defs);
}
2981
/// Public wrapper for [`extract_definitions`] — enables integration tests in
/// `crates/ripvec-core/tests/` to drive Python decorator detection (C1, 4.1.1).
///
/// Returns the [`Definition`] list for `source` parsed as the language
/// identified by `lang_config`.
///
/// Pure delegation: the result of [`extract_definitions`] is returned as-is,
/// with no post-processing applied here.
#[must_use]
pub fn extract_definitions_pub(
    source: &str,
    lang_config: &languages::LangConfig,
) -> Vec<Definition> {
    extract_definitions(source, lang_config)
}
2994
/// Build an index from definition name to list of `DefId`s.
///
/// Public wrapper around `build_def_index`. Each `DefId` is a
/// `(file index, definition index within that file)` pair, and definitions
/// that share a name are collected under the same key.
#[must_use]
pub fn build_def_index_pub(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
    build_def_index(files)
}
3000
3001fn build_def_index(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
3002    let mut index: HashMap<String, Vec<DefId>> = HashMap::new();
3003    for (file_idx, file) in files.iter().enumerate() {
3004        for (def_idx, def) in file.defs.iter().enumerate() {
3005            #[expect(clippy::cast_possible_truncation)]
3006            let did: DefId = (file_idx as u32, def_idx as u16);
3007            index.entry(def.name.clone()).or_default().push(did);
3008        }
3009    }
3010    index
3011}
3012
3013/// Resolve call references to target definitions.
3014///
3015/// Resolution priority:
3016///
3017/// 1. **Qualified path** (`qualified_path = Some("mod_a::foo")`): filter candidates
3018///    by qualifier match (file path or scope contains the qualifier segment). Unique
3019///    match → resolve; ambiguous or no match → leave `None`.
3020/// 2. **Receiver type — direct scope match** (`receiver_type = Some("Foo")`):
3021///    for method calls, prefer candidates whose `scope` contains the receiver
3022///    type name (e.g., `"impl_item Foo"`). Among receiver-matching candidates,
3023///    further prefer those in imported files. Unique match → resolve;
3024///    ambiguous → leave `None`.
3025///    When this step finds nothing on the receiver class itself, sub-step 2b
3026///    (Python MRO walk) runs: when `Foo` has a recorded parent chain, walk
3027///    the receiver class's MRO (left-first DFS) and try the scope-match
3028///    against each ancestor's name. First ancestor with a matching candidate
3029///    wins. See [`compute_python_mro`] for the simplification rationale (it
3030///    diverges from C3 only on diamond shapes where two ancestors define the
3031///    same name).
3032/// 3. **Same file** (unqualified, no receiver): prefer definitions in the caller's
3033///    own file.
3034/// 4. **SQL suffix-match** (sql_file callers only, no exact-name match): when
3035///    the caller def has `kind = "sql_file"` and the bare `call_name` (e.g.,
3036///    `"issuer_returns"`) has no exact entry in the def index, scan all
3037///    `sql_file` defs for names ending with `_<call_name>` (e.g.,
3038///    `"silver_issuer_returns"`). This bridges dbt / sqlmesh layered schema
3039///    prefixes: `gold_issuer_returns.sql` uses `FROM @{schema}.issuer_returns`
3040///    which tree-sitter reduces to the bare name `"issuer_returns"`, while the
3041///    target def is the synthetic `sql_file` def named `"silver_issuer_returns"`.
3042///    Unique suffix-match → resolve; ambiguous (multiple layers match) or
3043///    no match → leave `None`. Non-sql_file callers are explicitly excluded.
3044/// 5. **Imported file** (unqualified, no receiver): check definitions in files this
3045///    file imports. Unique imported candidate → resolve.
3046/// 6. **Global-unique fallback**: when a bare call name maps to exactly one def in the
3047///    entire graph — regardless of file or import relationship — resolve to it.
3048///    Handles trait-method dispatch (`Trait::method` called as bare `method`) and
3049///    struct constructors referenced across non-imported module boundaries.
3050///    Only fires when Priorities 1–5 left the call unresolved and exactly one
3051///    candidate exists. Ambiguous (>1 candidates) → leave `None`.
3052/// 7. **Ambiguous or unresolved**: leave `resolved` as `None` (no silent first-wins).
3053///
3054/// Equivalent to [`resolve_calls_with_python_mro_pub`] with an empty MRO map
3055/// (Priority 2.5 is a no-op).
3056pub fn resolve_calls_pub<S: std::hash::BuildHasher>(
3057    files: &mut [FileNode],
3058    def_index: &HashMap<String, Vec<DefId>, S>,
3059) {
3060    let empty: HashMap<String, Vec<String>> = HashMap::new();
3061    resolve_calls_inner(files, def_index, &empty);
3062}
3063
3064/// Resolve call references with MRO-aware Python receiver dispatch enabled.
3065///
3066/// Identical to [`resolve_calls_pub`] except that Priority 2.5 (the MRO walk)
3067/// fires when the caller passes a non-empty `python_class_hierarchy`.
3068/// `build_graph` populates the hierarchy by parsing every Python source file
3069/// with [`extract_python_class_hierarchy`] and merging the per-file maps.
3070///
3071/// Tests that want to exercise MRO resolution without going through
3072/// `build_graph` can call this directly with a synthetic hierarchy.
3073pub fn resolve_calls_with_python_mro_pub<S, H>(
3074    files: &mut [FileNode],
3075    def_index: &HashMap<String, Vec<DefId>, S>,
3076    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
3077) where
3078    S: std::hash::BuildHasher,
3079    H: std::hash::BuildHasher,
3080{
3081    resolve_calls_inner(files, def_index, python_class_hierarchy);
3082}
3083
3084fn resolve_calls<S, H>(
3085    files: &mut [FileNode],
3086    def_index: &HashMap<String, Vec<DefId>, S>,
3087    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
3088) where
3089    S: std::hash::BuildHasher,
3090    H: std::hash::BuildHasher,
3091{
3092    resolve_calls_inner(files, def_index, python_class_hierarchy);
3093}
3094
3095#[expect(
3096    clippy::too_many_lines,
3097    reason = "8-priority resolution cascade (qualified path, receiver type, MRO walk, same-file, \
3098              SQL suffix-match, imported-file, global-unique, ambiguous); each priority is a \
3099              distinct decision branch and extracting helpers would require passing large shared \
3100              state across boundaries"
3101)]
3102fn resolve_calls_inner<S, H>(
3103    files: &mut [FileNode],
3104    def_index: &HashMap<String, Vec<DefId>, S>,
3105    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
3106) where
3107    S: std::hash::BuildHasher,
3108    H: std::hash::BuildHasher,
3109{
3110    // Pre-compute imported file sets for each file.
3111    let imported_files: Vec<std::collections::HashSet<u32>> = files
3112        .iter()
3113        .map(|f| {
3114            f.imports
3115                .iter()
3116                .filter_map(|imp| imp.resolved_idx)
3117                .collect()
3118        })
3119        .collect();
3120
3121    for file_idx in 0..files.len() {
3122        for def_idx in 0..files[file_idx].defs.len() {
3123            for call_idx in 0..files[file_idx].defs[def_idx].calls.len() {
3124                let call_name = files[file_idx].defs[def_idx].calls[call_idx].name.clone();
3125                let qualified_path = files[file_idx].defs[def_idx].calls[call_idx]
3126                    .qualified_path
3127                    .clone();
3128                let receiver_type = files[file_idx].defs[def_idx].calls[call_idx]
3129                    .receiver_type
3130                    .clone();
3131
3132                // I#54a — HCL output-attribute resolution. When the extractor
3133                // emits `terraform_remote_state.<NAME>.outputs.<ATTR>` for a
3134                // consumer reference, bind it to the upstream module's
3135                // `output "<ATTR>" { ... }` def. The def's `name` is exactly
3136                // ATTR (per the HCL @name capture in `compile_config`), and
3137                // the def lives in some `.tf` / `.tfvars` / `.hcl` file in
3138                // the workspace. This is the layout-agnostic resolution path:
3139                // when the upstream tfstate lives in a sibling file
3140                // (`infra/foo.tf` rather than `infra/<NAME>/main.tf`), the
3141                // path-segment branch below cannot help. Unique candidate →
3142                // resolve; ambiguous (the same output name in multiple HCL
3143                // files) → leave None.
3144                if let Some(ref qpath) = qualified_path
3145                    && qpath.starts_with("terraform_remote_state.")
3146                    && qpath.contains(".outputs.")
3147                    && !call_name.is_empty()
3148                {
3149                    if let Some(candidates) = def_index.get(&call_name) {
3150                        let hcl_matches: Vec<DefId> = candidates
3151                            .iter()
3152                            .copied()
3153                            .filter(|&(f_idx, _)| {
3154                                let path = std::path::Path::new(&files[f_idx as usize].path);
3155                                path.extension().is_some_and(|ext| {
3156                                    ext.eq_ignore_ascii_case("tf")
3157                                        || ext.eq_ignore_ascii_case("tfvars")
3158                                        || ext.eq_ignore_ascii_case("hcl")
3159                                })
3160                            })
3161                            .collect();
3162                        if hcl_matches.len() == 1 {
3163                            files[file_idx].defs[def_idx].calls[call_idx].resolved =
3164                                Some(hcl_matches[0]);
3165                        }
3166                    }
3167                    continue;
3168                }
3169
3170                // HCL: dedicated resolution for `terraform_remote_state.<NAME>`
3171                // and `module.<NAME>` qualified paths. Aurora's module DAG is
3172                // expressed by these patterns; resolve to the first def in any
3173                // file under a `/<NAME>/` directory segment. This is the
3174                // module-source contract (R2 + R3, Wave 3).
3175                if let Some(ref qpath) = qualified_path
3176                    && (qpath.starts_with("terraform_remote_state.")
3177                        || qpath.starts_with("module."))
3178                {
3179                    let target = &call_name; // already the bare module label
3180                    let segment_match = format!("/{target}/");
3181                    let alt_segment_prefix = format!("{target}/"); // when path starts with target dir
3182                    let candidate = files.iter().enumerate().find_map(|(idx, f)| {
3183                        if f.path.contains(&segment_match)
3184                            || f.path.starts_with(&alt_segment_prefix)
3185                        {
3186                            // Pick the first def in the file (or skip if file
3187                            // has no defs).
3188                            if !f.defs.is_empty() {
3189                                #[expect(
3190                                    clippy::cast_possible_truncation,
3191                                    reason = "file index fits in u32"
3192                                )]
3193                                {
3194                                    return Some((idx as u32, 0u16));
3195                                }
3196                            }
3197                        }
3198                        None
3199                    });
3200                    if let Some(did) = candidate {
3201                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3202                    }
3203                    continue;
3204                }
3205
3206                // ── Priority 1: Qualified-path resolution ────────────────
3207                //
3208                // `qualified_path` carries the full scoped path (e.g. "mod_a::foo").
3209                // We look up candidates by the bare `call_name`, then filter by
3210                // whether the file path or scope contains the qualifier prefix.
3211                if let Some(ref qpath) = qualified_path {
3212                    // Qualifier is everything before the final `::`.
3213                    let qualifier = if let Some(pos) = qpath.rfind("::") {
3214                        &qpath[..pos]
3215                    } else {
3216                        qpath.as_str()
3217                    };
3218                    let qual_segments: Vec<&str> = qualifier.split("::").collect();
3219
3220                    let Some(candidates) = def_index.get(&call_name) else {
3221                        continue;
3222                    };
3223
3224                    let matching: Vec<DefId> = candidates
3225                        .iter()
3226                        .copied()
3227                        .filter(|&(f_idx, _)| {
3228                            let file_path = &files[f_idx as usize].path;
3229                            let last_segment = qual_segments.last().copied().unwrap_or("");
3230                            let path_as_module =
3231                                file_path.trim_end_matches(".rs").replace(['/', '\\'], "::");
3232                            path_as_module.contains(last_segment)
3233                                || file_path.contains(last_segment)
3234                        })
3235                        .collect();
3236
3237                    if matching.len() == 1 {
3238                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(matching[0]);
3239                    }
3240                    // Ambiguous or no match → leave None.
3241                    continue;
3242                }
3243
3244                // ── Priority 3.5: SQL suffix-match resolution ────────────
3245                //
3246                // dbt / sqlmesh pipelines use layered schema prefixes:
3247                // `silver_issuer_returns.sql` defines the silver-layer model.
3248                // `gold_issuer_returns.sql` references it via a FROM clause
3249                // that tree-sitter parses as `name: "issuer_returns"` (the
3250                // `name:` field-selector strips the `@{schema}.` prefix).
3251                // `def_index.get("issuer_returns")` returns None — the def is
3252                // stored under "silver_issuer_returns".
3253                //
3254                // When: (a) no exact-name candidate exists, AND (b) the
3255                // caller's enclosing def is a sql_file (whole-file synthetic
3256                // def emitted by `enrich_sql_file_def`), walk every sql_file
3257                // def in the graph and check whether its name ends with
3258                // `_<call_name>`. Unique suffix-match → resolve. Ambiguous
3259                // (e.g., both gold_ and silver_ match the same bare name) →
3260                // leave None (no silent first-wins).
3261                //
3262                // Non-sql_file callers are explicitly excluded: a Rust
3263                // function_item whose call_name happens to end with a suffix
3264                // of some sql_file def must NOT be resolved via this path.
3265                if !def_index.contains_key(&call_name)
3266                    && files[file_idx].defs[def_idx].kind == "sql_file"
3267                    && !call_name.is_empty()
3268                {
3269                    let suffix = format!("_{call_name}");
3270                    let suffix_str = suffix.as_str();
3271                    // Exclude the caller def itself from the suffix scan:
3272                    // `gold_issuer_returns` also ends with `_issuer_returns`
3273                    // but must not self-resolve.
3274                    #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
3275                    let caller_did: DefId = (file_idx as u32, def_idx as u16);
3276                    let suffix_matches: Vec<DefId> = files
3277                        .iter()
3278                        .enumerate()
3279                        .flat_map(|(f_idx, f)| {
3280                            f.defs.iter().enumerate().filter_map(move |(d_idx, d)| {
3281                                #[expect(
3282                                    clippy::cast_possible_truncation,
3283                                    reason = "file and def indices fit in u32/u16"
3284                                )]
3285                                let did: DefId = (f_idx as u32, d_idx as u16);
3286                                if d.kind == "sql_file"
3287                                    && d.name.ends_with(suffix_str)
3288                                    && did != caller_did
3289                                {
3290                                    Some(did)
3291                                } else {
3292                                    None
3293                                }
3294                            })
3295                        })
3296                        .collect();
3297                    if suffix_matches.len() == 1 {
3298                        files[file_idx].defs[def_idx].calls[call_idx].resolved =
3299                            Some(suffix_matches[0]);
3300                    }
3301                    // Ambiguous (>1) or no match (0) → leave None.
3302                    continue;
3303                }
3304
3305                let Some(candidates) = def_index.get(&call_name) else {
3306                    continue;
3307                };
3308
3309                // ── Priority 2: Receiver-type resolution ─────────────────
3310                //
3311                // `receiver_type = Some("Foo")` means this is a method call on a
3312                // value whose type is `Foo`. Filter candidates to those whose scope
3313                // chain contains the receiver type name.
3314                if let Some(ref rtype) = receiver_type {
3315                    // Candidates whose scope contains the receiver type name.
3316                    let receiver_matching: Vec<DefId> = candidates
3317                        .iter()
3318                        .copied()
3319                        .filter(|&(f_idx, d_idx)| {
3320                            let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
3321                            scope.contains(rtype.as_str())
3322                        })
3323                        .collect();
3324
3325                    if receiver_matching.len() == 1 {
3326                        files[file_idx].defs[def_idx].calls[call_idx].resolved =
3327                            Some(receiver_matching[0]);
3328                        continue;
3329                    }
3330
3331                    if receiver_matching.len() > 1 {
3332                        // Among receiver-matching candidates, prefer those in imported files.
3333                        let imported_receiver_matching: Vec<DefId> = receiver_matching
3334                            .iter()
3335                            .copied()
3336                            .filter(|(f, _)| imported_files[file_idx].contains(f))
3337                            .collect();
3338                        if imported_receiver_matching.len() == 1 {
3339                            files[file_idx].defs[def_idx].calls[call_idx].resolved =
3340                                Some(imported_receiver_matching[0]);
3341                        }
3342                        // Ambiguous even after import filter → leave None.
3343                        continue;
3344                    }
3345
3346                    // ── Priority 2.5: Python MRO walk ─────────────────────
3347                    //
3348                    // The receiver type's own scope has no matching def — but
3349                    // the method may live on an ancestor class. Walk the MRO
3350                    // (left-first DFS) and try the scope-match against each
3351                    // ancestor's name. First ancestor with at least one
3352                    // scope-matching candidate wins; if multiple candidates
3353                    // match for the same ancestor, prefer imported files,
3354                    // else take the first in stable order.
3355                    //
3356                    // Liskov: a subclass's `self.method()` call must dispatch
3357                    // through the MRO; over-approximating ancestors is the
3358                    // correct conservative move for a reverse call graph.
3359                    // For non-Python languages or Python receivers with no
3360                    // recorded parents, `compute_python_mro` returns an
3361                    // empty vector and this loop is a no-op.
3362                    let mro = compute_python_mro(rtype, python_class_hierarchy);
3363                    let mut resolved_via_mro: Option<DefId> = None;
3364                    for ancestor in &mro {
3365                        let ancestor_matching: Vec<DefId> = candidates
3366                            .iter()
3367                            .copied()
3368                            .filter(|&(f_idx, d_idx)| {
3369                                let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
3370                                scope.contains(ancestor.as_str())
3371                            })
3372                            .collect();
3373                        if ancestor_matching.len() == 1 {
3374                            resolved_via_mro = Some(ancestor_matching[0]);
3375                            break;
3376                        }
3377                        if ancestor_matching.len() > 1 {
3378                            // Prefer imported files among the ancestor matches.
3379                            let imported_ancestor: Vec<DefId> = ancestor_matching
3380                                .iter()
3381                                .copied()
3382                                .filter(|(f, _)| imported_files[file_idx].contains(f))
3383                                .collect();
3384                            if imported_ancestor.len() == 1 {
3385                                resolved_via_mro = Some(imported_ancestor[0]);
3386                                break;
3387                            }
3388                            // Ambiguous at this ancestor — pick the first
3389                            // candidate in stable order. The MRO walk's job
3390                            // is to find *an* implementing def for an
3391                            // inherited call, not to compute the runtime
3392                            // dispatch winner; any plausible candidate is
3393                            // useful for the reverse call graph.
3394                            resolved_via_mro = Some(ancestor_matching[0]);
3395                            break;
3396                        }
3397                    }
3398                    if let Some(did) = resolved_via_mro {
3399                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3400                        continue;
3401                    }
3402                    // No receiver-matching candidates anywhere in the MRO →
3403                    // fall through to bare-name resolution.
3404                }
3405
3406                // ── Priority 3: Same-file resolution ─────────────────────
3407                #[expect(clippy::cast_possible_truncation)]
3408                let file_idx_u32 = file_idx as u32;
3409                if let Some(&did) = candidates.iter().find(|(f, _)| *f == file_idx_u32) {
3410                    files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
3411                    continue;
3412                }
3413
3414                // ── Priority 4: Imported-file resolution ──────────────────
3415                let imported_candidates: Vec<DefId> = candidates
3416                    .iter()
3417                    .copied()
3418                    .filter(|(f, _)| imported_files[file_idx].contains(f))
3419                    .collect();
3420                if imported_candidates.len() == 1 {
3421                    files[file_idx].defs[def_idx].calls[call_idx].resolved =
3422                        Some(imported_candidates[0]);
3423                }
3424
3425                // ── Priority 5.5 (I#54b): SQL file-level resolution ──────
3426                //
3427                // When the caller's enclosing def is a `sql_file` (the
3428                // synthetic whole-file def emitted by `enrich_sql_file_def`)
3429                // and at least one candidate is also a `sql_file` in a
3430                // different file, prefer that candidate. dbt/sqlmesh models
3431                // are named by filename; a `FROM upstream_table` reference
3432                // means "the model defined in `upstream_table.sql`", which
3433                // is exactly the sql_file def named `upstream_table`.
3434                //
3435                // Without this branch the bare-name lookup is ambiguous on
3436                // the global-unique step whenever the upstream file also
3437                // contains a `CREATE TABLE upstream_table` (two co-located
3438                // candidates: `create_table` and `sql_file` — same logical
3439                // entity but two distinct `Definition` records). The
3440                // sql_file is the correct target for cross-file edges
3441                // because it carries `calls[]` for further chaining.
3442                //
3443                // Only fires for sql_file callers — Rust / Python /
3444                // JavaScript bare-name calls that happen to match a
3445                // sql_file def must NOT be resolved through this path.
3446                if files[file_idx].defs[def_idx].kind == "sql_file"
3447                    && files[file_idx].defs[def_idx].calls[call_idx]
3448                        .resolved
3449                        .is_none()
3450                {
3451                    #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
3452                    let caller_file_u32 = file_idx as u32;
3453                    let sql_file_targets: Vec<DefId> = candidates
3454                        .iter()
3455                        .copied()
3456                        .filter(|&(f_idx, d_idx)| {
3457                            f_idx != caller_file_u32
3458                                && files[f_idx as usize].defs[d_idx as usize].kind == "sql_file"
3459                        })
3460                        .collect();
3461                    if sql_file_targets.len() == 1 {
3462                        files[file_idx].defs[def_idx].calls[call_idx].resolved =
3463                            Some(sql_file_targets[0]);
3464                        continue;
3465                    }
3466                }
3467
3468                // ── Priority 6: Global-unique fallback ────────────────────────────
3469                //
3470                // When a bare call name maps to exactly one def in the entire graph
3471                // — regardless of file or import relationship — resolve to it. This
3472                // bridges trait-method dispatch (`Trait::method` called as bare
3473                // `method`) and constructors referenced across non-imported module
3474                // boundaries.
3475                //
                // Guard with `.is_none()` so we do not overwrite a Priority 4
                // (imported-file) resolution that already fired.
3478                if candidates.len() == 1
3479                    && files[file_idx].defs[def_idx].calls[call_idx]
3480                        .resolved
3481                        .is_none()
3482                {
3483                    files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(candidates[0]);
3484                }
3485                // Priority 7: Ambiguous or unresolved → leave None.
3486            }
3487        }
3488    }
3489}
3490
3491/// Compute a prefix-sum offset table for flattening `DefId`s to linear indices.
3492fn def_offsets(files: &[FileNode]) -> Vec<usize> {
3493    let mut offsets = Vec::with_capacity(files.len() + 1);
3494    offsets.push(0);
3495    for file in files {
3496        offsets.push(offsets.last().unwrap() + file.defs.len());
3497    }
3498    offsets
3499}
3500
3501/// Flatten a `DefId` to a linear index using the offset table.
3502fn flatten_def_id(offsets: &[usize], did: DefId) -> usize {
3503    offsets[did.0 as usize] + did.1 as usize
3504}
3505
3506/// Build top-N caller and callee lists for each definition (flattened).
3507fn build_def_neighbor_lists(
3508    n: usize,
3509    edges: &[(u32, u32, u32)],
3510    offsets: &[usize],
3511) -> (Vec<Vec<DefId>>, Vec<Vec<DefId>>) {
3512    let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3513    let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
3514
3515    for &(src, dst, w) in edges {
3516        let (s, d) = (src as usize, dst as usize);
3517        if s < n && d < n {
3518            incoming[d].push((src, w));
3519            outgoing[s].push((dst, w));
3520        }
3521    }
3522
3523    // Convert flat index back to DefId
3524    let to_def_id = |flat: u32| -> DefId {
3525        let flat_usize = flat as usize;
3526        let file_idx = offsets.partition_point(|&o| o <= flat_usize) - 1;
3527        let def_idx = flat_usize - offsets[file_idx];
3528        #[expect(clippy::cast_possible_truncation)]
3529        (file_idx as u32, def_idx as u16)
3530    };
3531
3532    let callers = incoming
3533        .into_iter()
3534        .map(|mut v| {
3535            v.sort_by_key(|b| std::cmp::Reverse(b.1));
3536            v.truncate(MAX_NEIGHBORS);
3537            v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
3538        })
3539        .collect();
3540
3541    let callees = outgoing
3542        .into_iter()
3543        .map(|mut v| {
3544            v.sort_by_key(|b| std::cmp::Reverse(b.1));
3545            v.truncate(MAX_NEIGHBORS);
3546            v.into_iter().map(|(idx, _)| to_def_id(idx)).collect()
3547        })
3548        .collect();
3549
3550    (callers, callees)
3551}
3552
3553// ── PageRank ─────────────────────────────────────────────────────────
3554
3555/// Compute `PageRank` scores for a graph.
3556///
3557/// If `focus` is `Some(idx)`, computes topic-sensitive `PageRank` biased
3558/// toward file `idx`. Otherwise computes standard (uniform) `PageRank`.
3559///
3560/// Returns one score per node, summing to 1.0.
3561#[expect(
3562    clippy::cast_precision_loss,
3563    reason = "node count fits comfortably in f32"
3564)]
3565fn pagerank(n: usize, edges: &[(u32, u32, u32)], focus: Option<usize>) -> Vec<f32> {
3566    if n == 0 {
3567        return vec![];
3568    }
3569
3570    // Build adjacency: out_edges[src] = [(dst, weight)]
3571    let mut out_edges: Vec<Vec<(usize, f32)>> = vec![vec![]; n];
3572    let mut out_weight: Vec<f32> = vec![0.0; n];
3573
3574    for &(src, dst, w) in edges {
3575        let (s, d) = (src as usize, dst as usize);
3576        if s < n && d < n {
3577            #[expect(clippy::cast_possible_truncation, reason = "edge weights are small")]
3578            let wf = f64::from(w) as f32;
3579            out_edges[s].push((d, wf));
3580            out_weight[s] += wf;
3581        }
3582    }
3583
3584    // Personalization vector (Haveliwala 2002, topic-sensitive PageRank).
3585    //
3586    // When a focus file is specified, the teleportation distribution is split:
3587    //   - PERSONALIZATION_ALPHA (0.15) concentrated on the focus node.
3588    //   - (1 - PERSONALIZATION_ALPHA) = 0.85 spread uniformly over the
3589    //     remaining (n - 1) other nodes.
3590    //
3591    // This gives the focus file a gentle bias over its neighbors without
3592    // collapsing every other file to an equal uniform floor. The resulting
3593    // ranks still vary across the corpus, so the caller sees a *neighborhood*
3594    // of semantically related files rebiased toward the focus (I#16 fix).
3595    //
3596    // For n == 1 there are no other nodes; the focus gets all mass (= 1.0).
3597    let bias: Vec<f32> = if let Some(idx) = focus {
3598        if n == 1 {
3599            vec![1.0_f32]
3600        } else {
3601            let other_mass = (1.0_f32 - PERSONALIZATION_ALPHA) / (n as f32 - 1.0);
3602            let mut b = vec![other_mass; n];
3603            if idx < n {
3604                b[idx] = PERSONALIZATION_ALPHA;
3605            }
3606            // Verify sum ≈ 1.0 (should hold by construction; normalization
3607            // guards against floating-point drift on very large graphs).
3608            let sum: f32 = b.iter().sum();
3609            for v in &mut b {
3610                *v /= sum;
3611            }
3612            b
3613        }
3614    } else {
3615        vec![1.0 / n as f32; n]
3616    };
3617
3618    let mut rank = vec![1.0 / n as f32; n];
3619    let mut next_rank = vec![0.0_f32; n];
3620
3621    for _ in 0..MAX_ITERATIONS {
3622        // Collect dangling mass (nodes with no outgoing edges)
3623        let dangling: f32 = rank
3624            .iter()
3625            .enumerate()
3626            .filter(|&(i, _)| out_edges[i].is_empty())
3627            .map(|(_, &r)| r)
3628            .sum();
3629
3630        // Distribute rank
3631        for (i, nr) in next_rank.iter_mut().enumerate() {
3632            *nr = (1.0 - DAMPING).mul_add(bias[i], DAMPING * dangling * bias[i]);
3633        }
3634
3635        for (src, edges_list) in out_edges.iter().enumerate() {
3636            if edges_list.is_empty() {
3637                continue;
3638            }
3639            let src_rank = rank[src];
3640            let total_w = out_weight[src];
3641            for &(dst, w) in edges_list {
3642                next_rank[dst] += DAMPING * src_rank * (w / total_w);
3643            }
3644        }
3645
3646        // Check convergence
3647        let diff: f32 = rank
3648            .iter()
3649            .zip(next_rank.iter())
3650            .map(|(a, b)| (a - b).abs())
3651            .sum();
3652
3653        std::mem::swap(&mut rank, &mut next_rank);
3654
3655        if diff < EPSILON {
3656            break;
3657        }
3658    }
3659
3660    rank
3661}
3662
3663// ── Graph Building ───────────────────────────────────────────────────
3664
/// Intermediate result from definition-level graph computation.
///
/// Produced by `compute_def_graph` and unpacked field-by-field into a
/// `RepoGraph` by the graph builders.
struct DefGraphData {
    /// Deduplicated def→def edges `(src, dst, weight)`: resolved calls
    /// (weight = number of call sites) plus the trait↔impl edges.
    def_edges: Vec<(DefId, DefId, u32)>,
    /// `PageRank` score per definition, indexed by flattened def index.
    def_ranks: Vec<f32>,
    /// Top callers per definition, indexed by flattened def index.
    def_callers: Vec<Vec<DefId>>,
    /// Top callees per definition, indexed by flattened def index.
    def_callees: Vec<Vec<DefId>>,
    /// Prefix-sum table from `def_offsets` (`files.len() + 1` entries).
    offsets: Vec<usize>,
    /// File-level `PageRank` scores computed on `file_edges`.
    base_ranks: Vec<f32>,
    /// Cross-file edges `(src_file, dst_file, weight)` aggregated from
    /// `def_edges`; intra-file edges are excluded.
    file_edges: Vec<(u32, u32, u32)>,
}
3675
/// Build bidirectional trait↔impl method edges for `PageRank` propagation (G3).
///
/// Public wrapper that forwards to the private `build_trait_impl_edges`.
///
/// For every impl method that matches a trait method, adds:
/// - `(trait_def_id, impl_def_id, 1)` — trait → impl
/// - `(impl_def_id, trait_def_id, 1)` — impl → trait
///
/// Detection heuristic: an impl method "overrides" a trait method when:
/// - The impl method's kind is `"function_item"` and its `scope` starts with
///   `"impl_item"`.
/// - The trait method's kind is `"function_signature_item"` (whatever its
///   scope), or its kind is `"function_item"` and its `scope` starts with
///   `"trait_item"` (presumably a trait method with a default body).
/// - Both have the same `name`.
/// - The impl's file imports the trait's file (or they share a file).
///
/// This is heuristic, not sound: it may produce false positives when two
/// unrelated traits define methods with the same name. The practical false-
/// positive rate on real Rust codebases is low because method names are
/// usually unique within a crate.
#[must_use]
pub fn build_trait_impl_edges_pub(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
    build_trait_impl_edges(files)
}
3698
3699fn build_trait_impl_edges(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
3700    // Build index: method_name → list of (DefId, is_trait_method).
3701    // trait method: kind == "function_signature_item" (abstract) OR scope contains "trait_item".
3702    // impl method:  kind == "function_item" AND scope contains "impl_item".
3703    let mut trait_methods: HashMap<String, Vec<DefId>> = HashMap::new();
3704    let mut impl_methods: HashMap<String, Vec<DefId>> = HashMap::new();
3705
3706    for (fi, file) in files.iter().enumerate() {
3707        for (di, def) in file.defs.iter().enumerate() {
3708            #[expect(clippy::cast_possible_truncation)]
3709            let did: DefId = (fi as u32, di as u16);
3710            if def.kind == "function_signature_item"
3711                || (def.scope.starts_with("trait_item") && def.kind == "function_item")
3712            {
3713                trait_methods.entry(def.name.clone()).or_default().push(did);
3714            } else if def.kind == "function_item" && def.scope.starts_with("impl_item") {
3715                impl_methods.entry(def.name.clone()).or_default().push(did);
3716            }
3717        }
3718    }
3719
3720    // Pre-build imported-files sets to restrict matching.
3721    let imported_sets: Vec<std::collections::HashSet<u32>> = files
3722        .iter()
3723        .map(|f| {
3724            f.imports
3725                .iter()
3726                .filter_map(|imp| imp.resolved_idx)
3727                .collect()
3728        })
3729        .collect();
3730
3731    let mut edges: Vec<(DefId, DefId, u32)> = Vec::new();
3732
3733    for (name, trait_defs) in &trait_methods {
3734        let Some(impl_defs) = impl_methods.get(name) else {
3735            continue;
3736        };
3737        for &(tf, td) in trait_defs {
3738            for &(imf, imd) in impl_defs {
3739                // The impl file must import the trait file (or be the same file).
3740                let connected = tf == imf
3741                    || imported_sets
3742                        .get(imf as usize)
3743                        .is_some_and(|s| s.contains(&tf));
3744                if connected {
3745                    let trait_id: DefId = (tf, td);
3746                    let impl_id: DefId = (imf, imd);
3747                    edges.push((trait_id, impl_id, 1));
3748                    edges.push((impl_id, trait_id, 1));
3749                }
3750            }
3751        }
3752    }
3753
3754    edges
3755}
3756
3757/// Build definition-level edges, compute `PageRank`, and derive file-level data.
3758fn compute_def_graph(files: &[FileNode]) -> DefGraphData {
3759    // Build definition-level edge list from resolved calls
3760    let mut def_edge_map: HashMap<(DefId, DefId), u32> = HashMap::new();
3761    for (file_idx, file) in files.iter().enumerate() {
3762        for (def_idx, def) in file.defs.iter().enumerate() {
3763            #[expect(clippy::cast_possible_truncation)]
3764            let caller_id: DefId = (file_idx as u32, def_idx as u16);
3765            for call in &def.calls {
3766                if let Some(callee_id) = call.resolved {
3767                    *def_edge_map.entry((caller_id, callee_id)).or_insert(0) += 1;
3768                }
3769            }
3770        }
3771    }
3772
3773    // Add trait↔impl bidirectional edges (G3).
3774    let trait_impl_edges = build_trait_impl_edges(files);
3775    for (src, dst, w) in trait_impl_edges {
3776        *def_edge_map.entry((src, dst)).or_insert(0) += w;
3777    }
3778
3779    let def_edges: Vec<(DefId, DefId, u32)> = def_edge_map
3780        .into_iter()
3781        .map(|((src, dst), w)| (src, dst, w))
3782        .collect();
3783
3784    // Compute def-level PageRank
3785    let offsets = def_offsets(files);
3786    let n_defs = *offsets.last().unwrap_or(&0);
3787
3788    let flat_def_edges: Vec<(u32, u32, u32)> = def_edges
3789        .iter()
3790        .map(|(src, dst, w)| {
3791            #[expect(clippy::cast_possible_truncation)]
3792            (
3793                flatten_def_id(&offsets, *src) as u32,
3794                flatten_def_id(&offsets, *dst) as u32,
3795                *w,
3796            )
3797        })
3798        .collect();
3799
3800    let def_ranks = pagerank(n_defs, &flat_def_edges, None);
3801
3802    // Derive file-level edges from def-level call edges. A cross-file def→def
3803    // edge contributes one file→file edge in the same direction; intra-file
3804    // edges are skipped (they cannot move rank between files).
3805    let mut file_edge_map: HashMap<(u32, u32), u32> = HashMap::new();
3806    for &(src, dst, w) in &def_edges {
3807        let src_file = src.0;
3808        let dst_file = dst.0;
3809        if src_file != dst_file {
3810            *file_edge_map.entry((src_file, dst_file)).or_insert(0) += w;
3811        }
3812    }
3813    let file_edges: Vec<(u32, u32, u32)> = file_edge_map
3814        .into_iter()
3815        .map(|((src, dst), w)| (src, dst, w))
3816        .collect();
3817
3818    // File-level rank: run PageRank directly on the file-level edge graph
3819    // (B-0025 fix, Cycle 11). The earlier formulation aggregated file rank
3820    // by summing per-def ranks of every def in the file, which amplified
3821    // the teleportation floor for files with high def cardinality but zero
3822    // in-degree (test files in particular). Running PageRank on
3823    // `file_edges` makes file rank obey the user-visible invariant: rank
3824    // flows from caller-file to callee-file, never from callee to caller.
3825    //
3826    // A file with no incoming file→file edges receives only the
3827    // teleportation floor `(1 - DAMPING) / n_files`, regardless of how
3828    // many defs it contains. Test files (which call hubs but are called
3829    // by no one) now correctly drop to the floor.
3830    let n_files = files.len();
3831    let base_ranks: Vec<f32> = if n_files == 0 {
3832        Vec::new()
3833    } else {
3834        pagerank(n_files, &file_edges, None)
3835    };
3836
3837    // Build def-level caller/callee lists
3838    let (def_callers, def_callees) = build_def_neighbor_lists(n_defs, &flat_def_edges, &offsets);
3839
3840    DefGraphData {
3841        def_edges,
3842        def_ranks,
3843        def_callers,
3844        def_callees,
3845        offsets,
3846        base_ranks,
3847        file_edges,
3848    }
3849}
3850
/// Build a dependency graph from a repository root.
///
/// Pipeline:
/// 1. Canonicalize `root`, load ignore patterns from the project config (if
///    one is found), and collect candidate files.
/// 2. Phase 1 (parallel): keep files whose extension has a language config
///    or an import query, and read their contents.
/// 3. Phase 2 (parallel): extract definitions and imports per file and
///    resolve each import to a file index where possible.
/// 4. Phase 3 (parallel): extract call sites into each definition.
/// 5. Build the Python class hierarchy, resolve calls to definitions,
///    compute the def-level and file-level graphs with `PageRank`, and build
///    caller/callee lists.
///
/// # Errors
///
/// Returns `crate::Error::Io` if `root` cannot be canonicalized. Files whose
/// contents cannot be read by `std::fs::read_to_string` are skipped rather
/// than reported as errors.
#[expect(
    clippy::too_many_lines,
    reason = "three-phase parallel pipeline (walk+filter, def+import extraction, call extraction) \
              plus resolve + graph build; phases share state (file_index, raw_sources) and \
              cannot be meaningfully split without passing large mutable structures across \
              boundaries with no clarity gain"
)]
pub fn build_graph(root: &Path) -> crate::Result<RepoGraph> {
    // Shadow `root` with its canonical form; the error keeps the path as the
    // caller supplied it.
    let root = root.canonicalize().map_err(|e| crate::Error::Io {
        path: root.display().to_string(),
        source: e,
    })?;

    // Honor ignore patterns from the project config when one is found.
    let mut walk_options = walk::WalkOptions::default();
    if let Some((_, config)) = crate::cache::config::find_config(&root) {
        walk_options.ignore_patterns = config.ignore.patterns;
    }
    let all_files = walk::collect_files_with_options(&root, &walk_options);

    // Phase 1: parallel filter + read. For each candidate path with a
    // supported extension, read its source from disk and emit a tuple
    // alongside its relative path. rayon spreads the I/O cost across
    // worker threads; on a 1M-file corpus this was ~20s sequential and
    // now sits in the 2-3s range bounded by disk + filter throughput.
    //
    // Tuple layout: (absolute path, path relative to root, extension, source).
    let raw_inputs: Vec<(PathBuf, String, String, String)> = all_files
        .par_iter()
        .filter_map(|path| {
            let ext = path
                .extension()
                .and_then(|e| e.to_str())
                .unwrap_or_default()
                .to_string();
            // Skip files that have neither a definition config nor an
            // import query for their extension.
            if languages::config_for_extension(&ext).is_none()
                && import_query_for_extension(&ext).is_none()
            {
                return None;
            }
            // Read failures drop the file from the graph instead of
            // aborting the whole build.
            let source = std::fs::read_to_string(path).ok()?;
            let rel_path = path
                .strip_prefix(&root)
                .unwrap_or(path)
                .display()
                .to_string();
            Some((path.clone(), rel_path, ext, source))
        })
        .collect();

    // Build the contiguous `files` Vec and the absolute-path -> idx
    // lookup. Sequential because both want stable indices that match
    // `raw_sources`'s order; the per-file work this gates is trivial.
    let mut file_index: HashMap<PathBuf, usize> = HashMap::with_capacity(raw_inputs.len());
    let mut files: Vec<FileNode> = Vec::with_capacity(raw_inputs.len());
    let mut raw_sources: Vec<(usize, String, String)> = Vec::with_capacity(raw_inputs.len());
    for (idx, (abs_path, rel_path, ext, source)) in raw_inputs.into_iter().enumerate() {
        file_index.insert(abs_path, idx);
        files.push(FileNode {
            path: rel_path,
            defs: vec![],
            imports: vec![],
        });
        raw_sources.push((idx, ext, source));
    }

    // Phase 2: parallel per-file definition + import extraction. Each
    // file's tree-sitter parse + def/import queries are independent;
    // zipping `files.par_iter_mut()` with `raw_sources.par_iter()` pairs
    // every `FileNode` with its own source so each rayon worker can grind
    // its own slice. The closures here
    // borrow `&root` and `&file_index` immutably (both Sync) and write
    // disjoint `FileNode` slots via the &mut iterator.
    files
        .par_iter_mut()
        .zip(raw_sources.par_iter())
        .for_each(|(file, (_, ext, source))| {
            if let Some(config) = languages::config_for_extension(ext) {
                file.defs = extract_definitions(source, &config);
                // Go method_declaration scopes are empty after the generic
                // extract_definitions pass (the method is a top-level node
                // with no structural parent in CONTAINER_KINDS). Enrich them
                // with the receiver type so that resolve_calls Priority 2
                // (scope.contains(recv_type)) fires correctly for cross-file
                // Go receiver-method calls. This populates def_callers[] for
                // Go in compute_def_graph (P1 fix).
                if languages::is_go_language(&config.language) {
                    enrich_go_method_def_scopes(source, &mut file.defs);
                }
                // SQL: prepend a synthetic file-level def named after the
                // filename stem (dbt/sqlmesh convention). The whole-file
                // byte range becomes the smallest-enclosing fallback for
                // FROM/JOIN call-edges that are not inside any CTE, which
                // is the resolution target for cross-model references
                // (S1, Wave 4). file.path is relative to the repo root and
                // is what file_stem() needs to derive the model name.
                if languages::is_sql_language(&config.language) {
                    enrich_sql_file_def(&file.path, source, &mut file.defs);
                }
            }
            if let Some((lang, import_query)) = import_query_for_extension(ext) {
                let raw_imports = extract_imports(source, &lang, &import_query);
                // Rebuild the absolute path, since `file.path` is relative
                // to `root`.
                let file_path = root.join(&file.path);
                file.imports = raw_imports
                    .into_iter()
                    .map(|raw| {
                        // Unresolvable imports (or indices that do not fit
                        // in u32) are kept with `resolved_idx: None`.
                        let resolved_idx =
                            resolve_import(&raw, ext, &file_path, &root, &file_index)
                                .and_then(|i| u32::try_from(i).ok());
                        ImportRef {
                            raw_path: raw,
                            resolved_idx,
                        }
                    })
                    .collect();
            }
        });

    // Phase 3: parallel per-file call extraction. Mutates each
    // FileNode's `defs[*].calls` independently. Aligned with
    // raw_sources by index via the zip.
    files
        .par_iter_mut()
        .zip(raw_sources.par_iter())
        .for_each(|(file, (_, ext, source))| {
            if let Some(call_config) = languages::call_query_for_extension(ext) {
                extract_calls(source, &call_config, &mut file.defs);
            }
        });

    // Build the Python class hierarchy (class_name → parent class names) by
    // walking every Python source file. The map is used by `resolve_calls`
    // Priority 2.5 to dispatch `self.method()` calls through the MRO when
    // the method lives on a parent / mixin class (Q1, Wave 2).
    //
    // Parallel: extract_python_class_hierarchy is pure per-file, then we
    // fold the per-file maps into one global map sequentially because
    // HashMap is not lock-free. On a 1k-Python-file corpus this fold takes
    // <10ms — much smaller than the parallel parse work that feeds it.
    let python_hierarchies: Vec<HashMap<String, Vec<String>>> = raw_sources
        .par_iter()
        .map(|(_, ext, source)| {
            if ext == "py" || ext == "pyi" {
                extract_python_class_hierarchy(source)
            } else {
                HashMap::new()
            }
        })
        .collect();
    let mut python_class_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
    for local in python_hierarchies {
        for (k, v) in local {
            // First declaration wins on name collisions across files. The
            // MRO walk only needs a plausible parent chain to find an
            // ancestor's methods; this is conservative but acceptable.
            python_class_hierarchy.entry(k).or_insert(v);
        }
    }

    // Resolve call references to target definitions
    let def_index = build_def_index(&files);
    resolve_calls(&mut files, &def_index, &python_class_hierarchy);

    // Build def-level graph, compute PageRank, and derive file-level data
    let graph_data = compute_def_graph(&files);

    // Build file-level caller/callee lists
    let n = files.len();
    let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);

    // Auto-tune alpha based on graph density: density is the number of
    // distinct file→file edges divided by the n * (n - 1) possible ordered
    // pairs, and alpha = 0.5 + 0.3 * min(density, 1.0), i.e. in [0.5, 0.8].
    #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
    let density = if n > 1 {
        graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
    } else {
        0.0
    };
    let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);

    Ok(RepoGraph {
        files,
        edges: graph_data.file_edges,
        base_ranks: graph_data.base_ranks,
        callers,
        callees,
        def_edges: graph_data.def_edges,
        def_ranks: graph_data.def_ranks,
        def_callers: graph_data.def_callers,
        def_callees: graph_data.def_callees,
        def_offsets: graph_data.offsets,
        alpha,
    })
}
4049
4050/// Build a `RepoGraph` directly from a pre-constructed `Vec<FileNode>`.
4051///
4052/// Skips the filesystem walk phase of [`build_graph`]; useful for integration
4053/// tests that want to build synthetic graphs without touching disk.
4054///
4055/// Resolves calls, builds the def-level graph (including G3 trait↔impl edges),
4056/// computes `PageRank`, and builds caller/callee lists.
4057#[must_use]
4058pub fn build_graph_from_files_pub(files: Vec<FileNode>) -> RepoGraph {
4059    let empty_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
4060    build_graph_from_files_with_mro_pub(files, &empty_hierarchy)
4061}
4062
4063/// Build a `RepoGraph` from synthetic files with Python MRO resolution enabled.
4064///
4065/// Identical to [`build_graph_from_files_pub`] except that the resolver's
4066/// Priority 2.5 (MRO walk) fires against the caller-supplied
4067/// `python_class_hierarchy` map. Use this when an integration test needs to
4068/// exercise `self.method()` dispatch through inherited classes — including
4069/// the inverse-edge propagation into [`RepoGraph::def_callers`] that
4070/// `lsp_incoming_calls` consumes.
4071///
4072/// I#58 / 4.1.3: when a `SubScreen.method_caller` calls `self.method()` and
4073/// the MRO walk binds the call to `Mixin.method`, the forward edge
4074/// `(SubScreen.method_caller, Mixin.method)` is recorded in
4075/// [`DefGraphData::def_edges`]. [`build_def_neighbor_lists`] then derives
4076/// symmetric inverse-edge entries: `def_callers[Mixin.method]` includes
4077/// `SubScreen.method_caller`. This is the property `lsp_incoming_calls` reads.
4078#[must_use]
4079pub fn build_graph_from_files_with_mro_pub<H>(
4080    mut files: Vec<FileNode>,
4081    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
4082) -> RepoGraph
4083where
4084    H: std::hash::BuildHasher,
4085{
4086    let def_index = build_def_index(&files);
4087    resolve_calls(&mut files, &def_index, python_class_hierarchy);
4088    let graph_data = compute_def_graph(&files);
4089    let n = files.len();
4090    let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);
4091
4092    #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
4093    let density = if n > 1 {
4094        graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
4095    } else {
4096        0.0
4097    };
4098    let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);
4099
4100    RepoGraph {
4101        files,
4102        edges: graph_data.file_edges,
4103        base_ranks: graph_data.base_ranks,
4104        callers,
4105        callees,
4106        def_edges: graph_data.def_edges,
4107        def_ranks: graph_data.def_ranks,
4108        def_callers: graph_data.def_callers,
4109        def_callees: graph_data.def_callees,
4110        def_offsets: graph_data.offsets,
4111        alpha,
4112    }
4113}
4114
4115// ── Dead-code analysis ───────────────────────────────────────────────────────
4116
/// Global flat definition index: `def_offsets[file_idx] + def_idx_within_file`.
///
/// This is the natural index into [`RepoGraph::def_ranks`],
/// [`RepoGraph::def_callers`], and [`RepoGraph::def_callees`].
///
/// It is a plain `usize` alias, so the compiler does not distinguish it from
/// any other index. [`compute_dead_code`] skips entry indices that are not
/// below `def_ranks.len()`.
pub type DefIndex = usize;
4122
/// A connected component in the dead-code subgraph.
///
/// All members are definitions that are unreachable from any entry point.
/// The component is formed by treating the call edges between dead
/// definitions (taken from [`RepoGraph::def_edges`]) as undirected — so a
/// transitively-dead group (including mutual-recursion cycles that are
/// collectively unreachable) surfaces as one cluster.
///
/// Produced by [`compute_dead_code`].
#[derive(Debug, Clone)]
pub struct DeadCluster {
    /// Global flat def index of the cluster root (the member with the
    /// highest [`RepoGraph::def_ranks`] score).
    pub root_def_idx: usize,
    /// Number of definitions in this cluster (equals
    /// `member_def_indices.len()`).
    pub size: usize,
    /// Sum of `end_line - start_line` for every member definition. The
    /// subtraction saturates at 0, and members that cannot be resolved to a
    /// definition contribute 0.
    pub total_lines: usize,
    /// All member global def indices, root first; the remaining members
    /// follow in ascending index order.
    pub member_def_indices: Vec<usize>,
}
4141
/// Confidence level for [`DeadCodeReport::dead_fraction`].
///
/// Callers should interpret `dead_fraction` according to this indicator.
/// When entry-point coverage is sparse (common in test-heavy corpora or
/// macro-dispatched frameworks), the raw fraction can be misleading — the
/// cluster list is always more trustworthy than the absolute number.
///
/// ## Rubric
///
/// | Level  | Condition                                                              |
/// |--------|------------------------------------------------------------------------|
/// | High   | entry coverage ≥ 10 % AND ≥ 1 LibraryExport AND ≥ 1 Main              |
/// | Medium | entry coverage ≥ 2 % AND ≥ 1 production entry AND tests not dominant  |
/// | Low    | entry coverage < 2 % OR tests > 80 % of all entries                   |
///
/// "Production entries" = LibraryExport + FrameworkDispatched + Main + Ffi.
///
/// [`compute_confidence`] evaluates the rows top to bottom and returns the
/// first match. `High` therefore does not look at test dominance, and `Low`
/// is the fallback for everything that is neither `High` nor `Medium` (for
/// example, a corpus with no production entries at all).
///
/// Serialized in `snake_case` (`high`, `medium`, `low`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DeadCodeConfidence {
    /// Entry-point coverage is dense; `dead_fraction` is a quantitative signal.
    High,
    /// `dead_fraction` is directional; trust the cluster list more than the number.
    Medium,
    /// `dead_fraction` may mislead (e.g., tests-dominated corpus); use cluster
    /// `member_defs` + `lsp_references` to verify each candidate before deletion.
    Low,
}
4169
4170/// Compute a [`DeadCodeConfidence`] level from entry-point kind counts.
4171///
4172/// Parameters mirror the counts emitted by
4173/// [`crate::entry_points::summarize_entry_point_kinds`]:
4174///
4175/// - `total_defs` — total number of definitions in the graph.
4176/// - `library_exports` — count of [`crate::entry_points::EntryPointKind::LibraryExport`] entries.
4177/// - `framework_dispatched` — count of [`crate::entry_points::EntryPointKind::FrameworkDispatched`] entries.
4178/// - `main_entries` — count of [`crate::entry_points::EntryPointKind::Main`] entries.
4179/// - `test_entries` — count of [`crate::entry_points::EntryPointKind::Test`] entries.
4180/// - `ffi_entries` — count of [`crate::entry_points::EntryPointKind::Ffi`] entries.
4181///
4182/// ProcMacro / Init / BuildScript entries are not passed separately; they are
4183/// rare enough that their absence does not materially affect the heuristic.
4184#[must_use]
4185#[expect(
4186    clippy::cast_precision_loss,
4187    reason = "entry and def counts fit comfortably in f64 for ratio computation"
4188)]
4189pub fn compute_confidence(
4190    total_defs: usize,
4191    library_exports: usize,
4192    framework_dispatched: usize,
4193    main_entries: usize,
4194    test_entries: usize,
4195    ffi_entries: usize,
4196) -> DeadCodeConfidence {
4197    let total_entries =
4198        library_exports + framework_dispatched + main_entries + test_entries + ffi_entries;
4199    let entry_coverage = if total_defs > 0 {
4200        total_entries as f64 / total_defs as f64
4201    } else {
4202        0.0
4203    };
4204    let test_dominant = total_entries > 0 && (test_entries as f64 / total_entries as f64) > 0.8;
4205    let production_entries = library_exports + framework_dispatched + main_entries + ffi_entries;
4206
4207    if entry_coverage >= 0.10 && library_exports >= 1 && main_entries >= 1 {
4208        DeadCodeConfidence::High
4209    } else if entry_coverage >= 0.02 && production_entries >= 1 && !test_dominant {
4210        DeadCodeConfidence::Medium
4211    } else {
4212        DeadCodeConfidence::Low
4213    }
4214}
4215
/// Summary report from [`compute_dead_code`].
///
/// The primary consumer is X3's `mcp__ripvec__find_dead_code` MCP tool.
#[derive(Debug, Clone)]
pub struct DeadCodeReport {
    /// Dead clusters sorted by size descending (largest first).
    pub dead_clusters: Vec<DeadCluster>,
    /// Total number of definitions unreachable from any entry point.
    pub total_dead_defs: usize,
    /// Total number of definitions reachable from at least one entry point.
    pub total_live_defs: usize,
    /// Fraction of all definitions that are dead: `dead / (dead + live)`.
    /// `0.0` when the graph has no definitions.
    pub dead_fraction: f32,
    /// Confidence that `dead_fraction` is a reliable signal.
    ///
    /// [`compute_dead_code`] always sets this to [`DeadCodeConfidence::Low`];
    /// the caller is expected to overwrite it with the result of
    /// [`compute_confidence`] once entry-point kind counts are available.
    pub confidence: DeadCodeConfidence,
}
4235
/// Returns true if the given path looks like a test, spec, or benchmark file.
///
/// Case-insensitive heuristic on `/`-separated paths. A path matches when:
///
/// - it contains `tests/` anywhere,
/// - it has a `spec/` or `specs/` directory, either at the start of the path
///   or after a `/`, or
/// - the file stem (the file name up to its first `.`) starts with `test_`,
///   ends with `_test`, or contains `bench`.
///
/// Every rule runs against the lowercased path, so `Test_utils.py` and
/// `FooBench.rs` are recognised. The `spec` rule also covers a root-level
/// `spec/` directory because the paths checked here are repository-relative
/// and carry no leading `/`.
fn is_test_path(path: &str) -> bool {
    let path_lc = path.to_lowercase();

    let in_test_dir = path_lc.contains("tests/")
        || path_lc.starts_with("spec/")
        || path_lc.starts_with("specs/")
        || path_lc.contains("/spec/")
        || path_lc.contains("/specs/");
    if in_test_dir {
        return true;
    }

    // `rsplit` always yields at least one item; the fallbacks only keep the
    // expressions total.
    let file_name = path_lc.rsplit('/').next().unwrap_or(path_lc.as_str());
    let stem = file_name.split('.').next().unwrap_or(file_name);
    stem.starts_with("test_") || stem.ends_with("_test") || stem.contains("bench")
}
4249
4250/// Resolve a flat [`DefIndex`] to its file index using the `def_offsets`
4251/// prefix-sum table.
4252fn flat_to_file_idx(offsets: &[usize], flat: DefIndex) -> usize {
4253    offsets.partition_point(|&o| o <= flat).saturating_sub(1)
4254}
4255
/// Union-find: path-compressing find. Returns the root representative of `x`.
///
/// Implemented iteratively in two passes — first walk up to the root, then
/// walk the same path again re-pointing every visited node directly at the
/// root. [`uf_union`] links roots without a rank heuristic, so parent chains
/// can grow as long as the number of elements; a recursive find would need
/// one stack frame per link on such a chain.
///
/// # Panics
///
/// Panics if `x`, or any parent reached from it, is out of bounds for
/// `parent`.
fn uf_find(parent: &mut [usize], x: usize) -> usize {
    // Pass 1: locate the root (the only node that is its own parent).
    let mut root = x;
    while parent[root] != root {
        root = parent[root];
    }

    // Pass 2: compress. Stops at the first node that already points at the
    // root (the root itself included).
    let mut node = x;
    while parent[node] != root {
        node = std::mem::replace(&mut parent[node], root);
    }

    root
}

/// Union-find: merge the components containing `x` and `y`.
///
/// The root of `x`'s component is attached beneath the root of `y`'s
/// component; nothing changes when both already share a root.
///
/// # Panics
///
/// Panics under the same conditions as [`uf_find`].
fn uf_union(parent: &mut [usize], x: usize, y: usize) {
    let rx = uf_find(parent, x);
    let ry = uf_find(parent, y);
    if rx != ry {
        parent[rx] = ry;
    }
}
4272
4273/// Compute the set of definitions unreachable from any entry point.
4274///
4275/// Returns dead clusters (connected components in the unreachable subgraph),
4276/// sorted by size descending.
4277///
4278/// # Algorithm
4279///
4280/// 1. Optionally filter test paths from `entry_def_indices` (when
4281///    `include_test_paths` is `false`). Test-path heuristic: the file path
4282///    contains `test_`, `_test`, `tests/`, `spec/`, `specs/`, or `bench`.
4283/// 2. Build full forward + reverse adjacency from [`RepoGraph::def_edges`]
4284///    (the untruncated edge list) as a Compressed Sparse Row (CSR) pair
4285///    of (`starts`, `dst`) `Vec<u32>` arrays — bounded O(E) memory rather
4286///    than the O(n_defs * avg_fanout) of duplicated `Vec<Vec<DefIndex>>`
4287///    that crashed at kernel scale (I#61). The BFS does NOT use
4288///    [`RepoGraph::def_callees`] / [`RepoGraph::def_callers`]: those are
4289///    display-oriented neighbor lists capped at [`MAX_NEIGHBORS`] per node,
4290///    which dropped hub callees from the live set and produced
4291///    false-positive dead reports (I#57).
4292/// 3. BFS forward over the full forward adjacency from the entry seeds ->
4293///    reachable set.
4294/// 4. Complement `(all_defs - reachable)` = dead set.
4295/// 5. Connected-components on the dead subgraph via the full forward +
4296///    reverse adjacency treated as undirected (union-find).
4297/// 5. For each cluster: pick the highest-rank def as the cluster root;
4298///    `size` = member count; `total_lines` = sum of `(end_line -
4299///    start_line)` per member.
4300/// 6. Sort clusters by size descending.
4301#[must_use]
4302#[expect(
4303    clippy::too_many_lines,
4304    reason = "six-step BFS+clustering pipeline; splitting into sub-functions \
4305              would require passing many interdependent Vec borrows with no \
4306              clarity gain"
4307)]
4308pub fn compute_dead_code<S: std::hash::BuildHasher>(
4309    graph: &RepoGraph,
4310    entry_def_indices: &HashSet<DefIndex, S>,
4311    include_test_paths: bool,
4312) -> DeadCodeReport {
4313    let n_defs = graph.def_ranks.len();
4314    if n_defs == 0 {
4315        return DeadCodeReport {
4316            dead_clusters: vec![],
4317            total_dead_defs: 0,
4318            total_live_defs: 0,
4319            dead_fraction: 0.0,
4320            confidence: DeadCodeConfidence::Low,
4321        };
4322    }
4323
4324    // Step 1: build the effective seed set.
4325    let seeds: Vec<DefIndex> = if include_test_paths {
4326        entry_def_indices.iter().copied().collect()
4327    } else {
4328        entry_def_indices
4329            .iter()
4330            .copied()
4331            .filter(|&flat| {
4332                let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
4333                let path = graph
4334                    .files
4335                    .get(file_idx)
4336                    .map(|f| f.path.as_str())
4337                    .unwrap_or("");
4338                !is_test_path(path)
4339            })
4340            .collect()
4341    };
4342
4343    // Build CSR-style forward + reverse adjacency from `graph.def_edges`.
4344    //
4345    // We do NOT use `graph.def_callees` / `graph.def_callers` here because
4346    // those are display-oriented neighbor lists truncated to MAX_NEIGHBORS
4347    // in `build_def_neighbor_lists`. Reading a truncated forward list during
4348    // BFS produced false-positive dead reports (I#57): a hub function with
4349    // more than 5 outgoing call edges silently dropped its low-weight
4350    // callees from the live set, and every helper transitively reachable
4351    // only through those dropped edges was marked dead.
4352    //
4353    // The original I#57 fix materialized `full_callees` / `full_callers` as
4354    // two `Vec<Vec<DefIndex>>` tables, which at kernel scale (n_defs ≈
4355    // 600 K with high-fan-out hubs) cost ~1.9 GB and OOM-crashed the MCP
4356    // server on every parameter variant (I#61, regression).
4357    //
4358    // This implementation uses Compressed Sparse Row (CSR) bucketing:
4359    // - `fwd_starts[i..=i+1]` slices `fwd_dst` to the destinations of node i.
4360    // - `rev_starts[i..=i+1]` slices `rev_src` to the sources that call i.
4361    // Storage is bounded O(E) (one u32 per directed edge × 2 directions)
4362    // plus O(n_defs) for the two start arrays. For a 600 K-def / 5 M-edge
4363    // kernel graph this is ~80 MB rather than ~1.9 GB.
4364    //
4365    // u32 is sufficient: every flat DefIndex value is < n_defs and ripvec
4366    // already constrains `n_defs <= u32::MAX` upstream (DefId.0 is u32).
4367    #[expect(
4368        clippy::cast_possible_truncation,
4369        reason = "n_defs <= u32::MAX is a ripvec-wide invariant; the per-edge \
4370                  cast cannot overflow"
4371    )]
4372    let (fwd_starts, fwd_dst, rev_starts, rev_src) = {
4373        // First pass: per-node out-degree (forward) and in-degree (reverse).
4374        let mut fwd_deg: Vec<u32> = vec![0; n_defs];
4375        let mut rev_deg: Vec<u32> = vec![0; n_defs];
4376        for &(src, dst, _w) in &graph.def_edges {
4377            let src_flat = graph.def_offsets[src.0 as usize] + src.1 as usize;
4378            let dst_flat = graph.def_offsets[dst.0 as usize] + dst.1 as usize;
4379            if src_flat < n_defs && dst_flat < n_defs {
4380                fwd_deg[src_flat] += 1;
4381                rev_deg[dst_flat] += 1;
4382            }
4383        }
4384        // Exclusive prefix sums -> bucket starts of length n_defs + 1.
4385        let mut fwd_starts: Vec<u32> = Vec::with_capacity(n_defs + 1);
4386        let mut rev_starts: Vec<u32> = Vec::with_capacity(n_defs + 1);
4387        let mut acc_f: u32 = 0;
4388        let mut acc_r: u32 = 0;
4389        fwd_starts.push(0);
4390        rev_starts.push(0);
4391        for i in 0..n_defs {
4392            acc_f = acc_f.saturating_add(fwd_deg[i]);
4393            acc_r = acc_r.saturating_add(rev_deg[i]);
4394            fwd_starts.push(acc_f);
4395            rev_starts.push(acc_r);
4396        }
4397        // Second pass: place each edge using a per-node cursor.
4398        let total_edges = acc_f as usize;
4399        let mut fwd_dst: Vec<u32> = vec![0u32; total_edges];
4400        let mut rev_src: Vec<u32> = vec![0u32; total_edges];
4401        // Reuse fwd_deg / rev_deg as cursors counting down from per-bucket capacity.
4402        // Easier: cursors start at the bucket start.
4403        let mut fwd_cursor: Vec<u32> = fwd_starts[..n_defs].to_vec();
4404        let mut rev_cursor: Vec<u32> = rev_starts[..n_defs].to_vec();
4405        for &(src, dst, _w) in &graph.def_edges {
4406            let src_flat = graph.def_offsets[src.0 as usize] + src.1 as usize;
4407            let dst_flat = graph.def_offsets[dst.0 as usize] + dst.1 as usize;
4408            if src_flat < n_defs && dst_flat < n_defs {
4409                let f = fwd_cursor[src_flat] as usize;
4410                fwd_dst[f] = dst_flat as u32;
4411                fwd_cursor[src_flat] += 1;
4412                let r = rev_cursor[dst_flat] as usize;
4413                rev_src[r] = src_flat as u32;
4414                rev_cursor[dst_flat] += 1;
4415            }
4416        }
4417        (fwd_starts, fwd_dst, rev_starts, rev_src)
4418    };
4419
4420    // Helper closures: O(1) bucket lookup into the CSR forward / reverse views.
4421    let fwd_callees = |flat: DefIndex| -> &[u32] {
4422        let lo = fwd_starts[flat] as usize;
4423        let hi = fwd_starts[flat + 1] as usize;
4424        &fwd_dst[lo..hi]
4425    };
4426    let rev_callers = |flat: DefIndex| -> &[u32] {
4427        let lo = rev_starts[flat] as usize;
4428        let hi = rev_starts[flat + 1] as usize;
4429        &rev_src[lo..hi]
4430    };
4431
4432    // Step 2: BFS forward over the CSR forward view from seeds -> reachable set.
4433    let mut reachable: Vec<bool> = vec![false; n_defs];
4434    let mut queue: std::collections::VecDeque<DefIndex> = std::collections::VecDeque::new();
4435
4436    for seed in &seeds {
4437        if *seed < n_defs && !reachable[*seed] {
4438            reachable[*seed] = true;
4439            queue.push_back(*seed);
4440        }
4441    }
4442
4443    while let Some(flat) = queue.pop_front() {
4444        for &callee_flat_u32 in fwd_callees(flat) {
4445            let callee_flat = callee_flat_u32 as usize;
4446            if callee_flat < n_defs && !reachable[callee_flat] {
4447                reachable[callee_flat] = true;
4448                queue.push_back(callee_flat);
4449            }
4450        }
4451    }
4452
4453    // Step 3: dead set = all_defs - reachable.
4454    let dead: Vec<DefIndex> = (0..n_defs).filter(|&i| !reachable[i]).collect();
4455    let total_dead_defs = dead.len();
4456    let total_live_defs = n_defs - total_dead_defs;
4457
4458    if dead.is_empty() {
4459        return DeadCodeReport {
4460            dead_clusters: vec![],
4461            total_dead_defs: 0,
4462            total_live_defs,
4463            dead_fraction: 0.0,
4464            confidence: DeadCodeConfidence::Low,
4465        };
4466    }
4467
4468    // Step 4: connected components on the dead subgraph via union-find.
4469    let dead_set: HashSet<DefIndex> = dead.iter().copied().collect();
4470    let dead_pos: HashMap<DefIndex, usize> = dead
4471        .iter()
4472        .copied()
4473        .enumerate()
4474        .map(|(pos, idx)| (idx, pos))
4475        .collect();
4476    let m = dead.len();
4477    let mut parent: Vec<usize> = (0..m).collect();
4478
4479    // Use the same CSR forward / reverse views built above for the BFS so
4480    // the dead subgraph's connected components reflect every actual call
4481    // edge — not the [`MAX_NEIGHBORS`]-truncated display view (I#57). The CSR
4482    // layout keeps storage O(E) at kernel scale rather than the duplicated
4483    // O(n_defs * avg_fanout) of the pre-I#61 `Vec<Vec<DefIndex>>` (I#61).
4484    for &flat in &dead {
4485        let pos_flat = dead_pos[&flat];
4486        for &callee_flat_u32 in fwd_callees(flat) {
4487            let callee_flat = callee_flat_u32 as usize;
4488            if dead_set.contains(&callee_flat) {
4489                let pos_callee = dead_pos[&callee_flat];
4490                uf_union(&mut parent, pos_flat, pos_callee);
4491            }
4492        }
4493        for &caller_flat_u32 in rev_callers(flat) {
4494            let caller_flat = caller_flat_u32 as usize;
4495            if dead_set.contains(&caller_flat) {
4496                let pos_caller = dead_pos[&caller_flat];
4497                uf_union(&mut parent, pos_flat, pos_caller);
4498            }
4499        }
4500    }
4501
4502    // Flatten roots (path compression for all).
4503    for i in 0..m {
4504        uf_find(&mut parent, i);
4505    }
4506
4507    // Group members by their component root.
4508    let mut components: HashMap<usize, Vec<DefIndex>> = HashMap::new();
4509    for (pos, &flat) in dead.iter().enumerate() {
4510        let root_pos = parent[pos];
4511        components.entry(root_pos).or_default().push(flat);
4512    }
4513
4514    // Step 5: build clusters - root is the highest-rank member.
4515    let mut clusters: Vec<DeadCluster> = components
4516        .into_values()
4517        .map(|members| {
4518            let root_flat = members
4519                .iter()
4520                .copied()
4521                .max_by(|&a, &b| {
4522                    let ra = graph.def_ranks.get(a).copied().unwrap_or(0.0);
4523                    let rb = graph.def_ranks.get(b).copied().unwrap_or(0.0);
4524                    ra.total_cmp(&rb)
4525                })
4526                .unwrap_or(members[0]);
4527
4528            let total_lines: usize = members
4529                .iter()
4530                .copied()
4531                .map(|flat| {
4532                    let file_idx = flat_to_file_idx(&graph.def_offsets, flat);
4533                    let def_idx = flat - graph.def_offsets[file_idx];
4534                    let def = graph.files.get(file_idx).and_then(|f| f.defs.get(def_idx));
4535                    def.map(|d| (d.end_line as usize).saturating_sub(d.start_line as usize))
4536                        .unwrap_or(0)
4537                })
4538                .sum();
4539
4540            let mut member_def_indices: Vec<usize> = std::iter::once(root_flat)
4541                .chain(members.iter().copied().filter(|&m| m != root_flat))
4542                .collect();
4543            if member_def_indices.len() > 1 {
4544                member_def_indices[1..].sort_unstable();
4545            }
4546
4547            DeadCluster {
4548                root_def_idx: root_flat,
4549                size: member_def_indices.len(),
4550                total_lines,
4551                member_def_indices,
4552            }
4553        })
4554        .collect();
4555
4556    // Step 6: sort by size descending.
4557    clusters.sort_by(|a, b| {
4558        b.size
4559            .cmp(&a.size)
4560            .then(b.root_def_idx.cmp(&a.root_def_idx))
4561    });
4562
4563    #[expect(
4564        clippy::cast_precision_loss,
4565        reason = "def counts fit comfortably in f32"
4566    )]
4567    let dead_fraction = if n_defs > 0 {
4568        total_dead_defs as f32 / n_defs as f32
4569    } else {
4570        0.0
4571    };
4572
4573    DeadCodeReport {
4574        dead_clusters: clusters,
4575        total_dead_defs,
4576        total_live_defs,
4577        dead_fraction,
4578        // Caller must set confidence via compute_confidence() after supplying
4579        // entry-point kind counts. Defaulting to Low is conservative.
4580        confidence: DeadCodeConfidence::Low,
4581    }
4582}
4583
4584impl RepoGraph {
4585    /// Get the `PageRank` score for a specific definition.
4586    #[must_use]
4587    pub fn def_rank(&self, did: DefId) -> f32 {
4588        let flat = self.def_offsets[did.0 as usize] + did.1 as usize;
4589        self.def_ranks.get(flat).copied().unwrap_or(0.0)
4590    }
4591
4592    /// Look up a definition by file path and name. Returns the first match.
4593    #[must_use]
4594    pub fn find_def(&self, file_path: &str, def_name: &str) -> Option<DefId> {
4595        for (file_idx, file) in self.files.iter().enumerate() {
4596            if file.path == file_path {
4597                for (def_idx, def) in file.defs.iter().enumerate() {
4598                    if def.name == def_name {
4599                        #[expect(clippy::cast_possible_truncation)]
4600                        return Some((file_idx as u32, def_idx as u16));
4601                    }
4602                }
4603            }
4604        }
4605        None
4606    }
4607
4608    /// Resolve a caller-supplied `focus_file` string to a file index in [`Self::files`].
4609    ///
4610    /// Accepts any of the path forms that ripvec itself emits or accepts:
4611    ///
4612    /// - **Exact stored path** (`device_opt/services/storage.py`) — direct match.
4613    /// - **LSP-shaped path** (`./device_opt/services/storage.py`) — the `./`
4614    ///   prefix used by every [`RepoMapLspLocation::file_path`] is stripped
4615    ///   before comparison so the documented chaining pattern
4616    ///   `get_repo_map(focus_file=hits[0].lsp_location.file_path)` works.
4617    /// - **Strict suffix** (`storage.py`, `services/storage.py`) — match when
4618    ///   the previous character in the stored path is `/`. Avoids matching
4619    ///   `foo_storage.py` for `storage.py`.
4620    ///
4621    /// Returns [`FocusResolution::Found`] when exactly one file matches,
4622    /// [`FocusResolution::Ambiguous`] when multiple files match (the caller
4623    /// surfaces the candidate list to the user), and [`FocusResolution::NotFound`]
4624    /// when no file matches.
4625    ///
4626    /// # Background
4627    ///
4628    /// Prior to this helper the MCP layer (`crates/ripvec-mcp/src/tools.rs`)
4629    /// did the matching inline with two bugs:
4630    ///
4631    /// 1. **`./` prefix mismatch.** [`RepoMapLspLocation::file_path`] always
4632    ///    carries a leading `./` (see [`file_lsp_location`]), but
4633    ///    [`FileNode::path`] does not. Passing the LSP location verbatim as
4634    ///    `focus_file` matched zero files. The matcher silently returned
4635    ///    `focus = None`, producing rank values bit-identical to the unfocused
4636    ///    call — the bug originally reported as "I#20 focus_file rebias
4637    ///    invisible on Python".
4638    /// 2. **Equal-length false negative.** When the user passed
4639    ///    `./device_opt/services/storage.py` and the stored path was
4640    ///    `device_opt/services/storage.py`, `exact` was false (the strings
4641    ///    differ by two bytes) and `strict_suffix` was false (the focus is
4642    ///    longer than the stored path, so `p.len() > focus.len()` fails). The
4643    ///    pathology surfaced specifically when the focus was a *full* path
4644    ///    with the LSP `./` prefix.
4645    ///
4646    /// Centralising the resolution here gives every caller the same
4647    /// normalization-tolerant semantics and one place to test the contract.
4648    #[must_use]
4649    pub fn resolve_focus_file(&self, focus: &str) -> FocusResolution {
4650        let normalized = normalize_focus_path(focus);
4651        let matches: Vec<usize> = self
4652            .files
4653            .iter()
4654            .enumerate()
4655            .filter_map(|(idx, f)| {
4656                if focus_matches_path(&f.path, normalized) {
4657                    Some(idx)
4658                } else {
4659                    None
4660                }
4661            })
4662            .collect();
4663        match matches.len() {
4664            0 => FocusResolution::NotFound,
4665            1 => FocusResolution::Found(matches[0]),
4666            _ => FocusResolution::Ambiguous(
4667                matches
4668                    .into_iter()
4669                    .map(|i| self.files[i].path.clone())
4670                    .collect(),
4671            ),
4672        }
4673    }
4674}
4675
/// Result of resolving a user-supplied `focus_file` string against a [`RepoGraph`].
///
/// See [`RepoGraph::resolve_focus_file`] for the resolution semantics and the
/// historical bug that motivated the helper.
#[derive(Debug, Clone)]
pub enum FocusResolution {
    /// Exactly one file matched. Carries the file index in [`RepoGraph::files`].
    Found(usize),
    /// No file matched. The caller treats this as an unfocused call.
    NotFound,
    /// Two or more files matched. The caller surfaces the candidate list so
    /// the user can disambiguate by passing a longer suffix or the full path.
    /// Carries the stored path of every matching file, in
    /// [`RepoGraph::files`] order.
    Ambiguous(Vec<String>),
}
4690
/// Remove one leading `./` from a `focus_file` path, if present.
///
/// [`file_lsp_location`] emits the `./` form for every
/// [`RepoMapLspLocation::file_path`] on a relative path, while
/// [`FileNode::path`] entries are stored without it. Dropping the prefix
/// puts the focus into the stored-path shape the suffix matcher compares
/// against.
///
/// Absolute paths (`/abs/path/file.py`) come back unchanged; they will then
/// fail the suffix match against the relative stored paths, which is the
/// correct behavior (the caller meant a different root entirely).
fn normalize_focus_path(focus: &str) -> &str {
    let without_dot_slash = focus.strip_prefix("./");
    without_dot_slash.unwrap_or(focus)
}
4704
/// Decide whether `focus` identifies `stored_path`.
///
/// A match is either the identical path or a strict suffix — one that is
/// immediately preceded by `/` in `stored_path`, so `storage.py` matches
/// `services/storage.py` but not `foo_storage.py`. An empty `focus` never
/// matches.
fn focus_matches_path(stored_path: &str, focus: &str) -> bool {
    if focus.is_empty() {
        return false;
    }
    match stored_path.strip_suffix(focus) {
        // Nothing precedes the suffix: the two paths are identical.
        Some("") => true,
        // Otherwise the suffix must begin right after a path separator.
        Some(leading) => leading.ends_with('/'),
        None => false,
    }
}
4718
4719/// Build top-N caller and callee lists for each file.
4720///
4721/// Given a list of weighted directed edges `(src, dst, weight)` over `n`
4722/// nodes, returns `(callers[i], callees[i])` for each node `i`, where each
4723/// list contains the top-[`MAX_NEIGHBORS`] adjacent nodes sorted by descending
4724/// edge weight.
4725///
4726/// Exposed as `pub` so that integration tests can construct synthetic
4727/// [`RepoGraph`] instances for unit-testing the JSON rendering without going
4728/// through a full disk walk.
4729#[must_use]
4730pub fn build_neighbor_lists(n: usize, edges: &[(u32, u32, u32)]) -> (Vec<Vec<u32>>, Vec<Vec<u32>>) {
4731    let mut incoming: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
4732    let mut outgoing: Vec<Vec<(u32, u32)>> = vec![vec![]; n];
4733
4734    for &(src, dst, w) in edges {
4735        let (s, d) = (src as usize, dst as usize);
4736        if s < n && d < n {
4737            incoming[d].push((src, w));
4738            outgoing[s].push((dst, w));
4739        }
4740    }
4741
4742    // Sort by weight descending, keep top N
4743    let trim = |lists: &mut [Vec<(u32, u32)>]| -> Vec<Vec<u32>> {
4744        lists
4745            .iter_mut()
4746            .map(|list| {
4747                list.sort_by_key(|b| std::cmp::Reverse(b.1));
4748                list.iter()
4749                    .take(MAX_NEIGHBORS)
4750                    .map(|(idx, _)| *idx)
4751                    .collect()
4752            })
4753            .collect()
4754    };
4755
4756    (trim(&mut incoming), trim(&mut outgoing))
4757}
4758
4759// ── Rendering ────────────────────────────────────────────────────────
4760
4761/// Render a budget-constrained overview of the repository.
4762///
4763/// Files are sorted by `PageRank` (or topic-sensitive rank if `focus` is
4764/// `Some`). Output uses four tiers of decreasing detail:
4765///
4766/// - **Tier 0** (top 10%): full path, rank, callers/callees, signatures with scopes
4767/// - **Tier 1** (next 20%): full path, rank, signatures
4768/// - **Tier 2** (next 40%): full path, rank, definition names and kinds
4769/// - **Tier 3** (bottom 30%): file path only
4770///
4771/// Stops accumulating output when the estimated token count exceeds
4772/// `max_tokens`.
4773#[must_use]
4774pub fn render(graph: &RepoGraph, max_tokens: usize, focus: Option<usize>) -> String {
4775    let n = graph.files.len();
4776    if n == 0 {
4777        return String::new();
4778    }
4779
4780    // Compute ranks (recompute topic-sensitive if focus is given)
4781    let ranks = if focus.is_some() {
4782        pagerank(n, &graph.edges, focus)
4783    } else {
4784        graph.base_ranks.clone()
4785    };
4786
4787    // Sort file indices by rank descending
4788    let mut sorted: Vec<usize> = (0..n).collect();
4789    sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
4790
4791    let mut output = String::new();
4792    let mut used_tokens = 0;
4793    let max_chars = max_tokens * CHARS_PER_TOKEN;
4794
4795    for (rank_pos, &file_idx) in sorted.iter().enumerate() {
4796        if used_tokens >= max_tokens {
4797            break;
4798        }
4799
4800        let file = &graph.files[file_idx];
4801        let score = ranks[file_idx];
4802        #[expect(clippy::cast_precision_loss, reason = "file counts fit in f32")]
4803        let percentile = (rank_pos as f32) / (n as f32);
4804
4805        let section = if percentile < 0.1 {
4806            render_tier0(graph, file_idx, file, score)
4807        } else if percentile < 0.3 {
4808            render_tier1(file, score)
4809        } else if percentile < 0.7 {
4810            render_tier2(file, score)
4811        } else {
4812            render_tier3(file)
4813        };
4814
4815        let section_chars = section.len();
4816        if used_tokens > 0 && used_tokens + section_chars / CHARS_PER_TOKEN > max_tokens {
4817            // Would exceed budget — try to fit at least the path
4818            let path_line = format!("{}\n", file.path);
4819            let path_tokens = path_line.len() / CHARS_PER_TOKEN;
4820            if used_tokens + path_tokens <= max_tokens {
4821                output.push_str(&path_line);
4822            }
4823            break;
4824        }
4825
4826        output.push_str(&section);
4827        used_tokens = output.len().min(max_chars) / CHARS_PER_TOKEN;
4828    }
4829
4830    output
4831}
4832
4833/// Render tier 0: full detail with callers, callees, and signatures.
4834fn render_tier0(graph: &RepoGraph, file_idx: usize, file: &FileNode, score: f32) -> String {
4835    let mut out = format!("## {} (rank: {score:.4})\n", file.path);
4836
4837    // Callers
4838    if file_idx < graph.callers.len() && !graph.callers[file_idx].is_empty() {
4839        let _ = write!(out, "  called by: ");
4840        let names: Vec<&str> = graph.callers[file_idx]
4841            .iter()
4842            .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
4843            .collect();
4844        let _ = writeln!(out, "{}", names.join(", "));
4845    }
4846
4847    // Callees
4848    if file_idx < graph.callees.len() && !graph.callees[file_idx].is_empty() {
4849        let _ = write!(out, "  calls: ");
4850        let names: Vec<&str> = graph.callees[file_idx]
4851            .iter()
4852            .filter_map(|&idx| graph.files.get(idx as usize).map(|f| f.path.as_str()))
4853            .collect();
4854        let _ = writeln!(out, "{}", names.join(", "));
4855    }
4856
4857    // Definitions with scope and signature
4858    for def in &file.defs {
4859        let scope_prefix = if def.scope.is_empty() {
4860            String::new()
4861        } else {
4862            format!("{} > ", def.scope)
4863        };
4864        if let Some(sig) = &def.signature {
4865            let _ = writeln!(out, "  {scope_prefix}{} {sig}", def.kind);
4866        } else {
4867            let _ = writeln!(out, "  {scope_prefix}{} {}", def.kind, def.name);
4868        }
4869    }
4870    let _ = writeln!(out);
4871    out
4872}
4873
4874/// Render tier 1: file path, rank, and signatures.
4875fn render_tier1(file: &FileNode, score: f32) -> String {
4876    let mut out = format!("## {} (rank: {score:.4})\n", file.path);
4877    for def in &file.defs {
4878        if let Some(sig) = &def.signature {
4879            let _ = writeln!(out, "  {sig}");
4880        } else {
4881            let _ = writeln!(out, "  {} {}", def.kind, def.name);
4882        }
4883    }
4884    let _ = writeln!(out);
4885    out
4886}
4887
4888/// Render tier 2: file path, rank, and definition names/kinds.
4889fn render_tier2(file: &FileNode, score: f32) -> String {
4890    let mut out = format!("{} (rank: {score:.4})", file.path);
4891    if !file.defs.is_empty() {
4892        let names: Vec<String> = file
4893            .defs
4894            .iter()
4895            .map(|d| format!("{}:{}", d.kind, d.name))
4896            .collect();
4897        let _ = write!(out, " -- {}", names.join(", "));
4898    }
4899    let _ = writeln!(out);
4900    out
4901}
4902
4903/// Render tier 3: file path only.
4904fn render_tier3(file: &FileNode) -> String {
4905    format!("{}\n", file.path)
4906}
4907
4908// ── JSON rendering ───────────────────────────────────────────────────
4909
4910/// Build the `lsp_location` for a file itself (line 0).
4911fn file_lsp_location(path: &str) -> RepoMapLspLocation {
4912    RepoMapLspLocation {
4913        file_path: if path.starts_with("./") || path.starts_with('/') {
4914            path.to_string()
4915        } else {
4916            format!("./{path}")
4917        },
4918        start_line: 0,
4919        start_character: 0,
4920        end_line: 0,
4921        end_character: 0,
4922    }
4923}
4924
4925/// Infer `ContentKind` from a file path's extension.
4926fn content_kind_for_path(path: &str) -> ContentKind {
4927    let ext = std::path::Path::new(path)
4928        .extension()
4929        .and_then(|e| e.to_str())
4930        .unwrap_or("");
4931    ContentKind::from_extension(ext)
4932}
4933
/// Minimum byte envelope reserved for each included file.
///
/// Even a file with zero symbols takes JSON overhead for path, rank, arrays,
/// etc. Calibrated against actual serde_json output for an empty `RepoMapFile`:
/// `{"lsp_location":{"file_path":"./src/file_N.rs","start_line":0,"start_character":0,`
/// `"end_line":0,"end_character":0},"rank":0.1234,"content_kind":"code",`
/// `"calls":[],"symbols":[],"truncated_symbols":0,"truncated_calls":0}` ≈ 250 bytes.
///
/// This floor prevents the budget allocator from giving a file so little space
/// that it can emit no envelope at all.
///
/// In `render_json_budgeted` this amount is charged against every included
/// file's budget up front, before any symbol or call is counted.
const FILE_ENVELOPE_MIN_BYTES: usize = 250;

/// Minimum useful payload for an admitted file: envelope plus room for at
/// least 2-3 typical-sized symbols. Files whose fair share cannot meet this
/// floor are excluded entirely (Fix A, 4.0.2). Without this guard, low-rank
/// tail files consume budget on envelopes that contain no symbols or calls,
/// crowding out content for the top files.
///
/// It is applied twice in `render_json_budgeted`: the number of admitted
/// files is capped at `budget / FILE_MIN_USEFUL_BYTES` (never below one), and
/// each admitted file's budget is floored at this value after the
/// `MAX_FILE_SHARE` cap has been applied.
const FILE_MIN_USEFUL_BYTES: usize = 600;

/// Fraction of each file's per-file budget reserved for outgoing-call edges
/// after the envelope is paid. The remaining (1 - this) fraction goes to
/// symbols. Symbol leftover flows into calls; call leftover flows to the
/// next file. (Fix C, 4.0.2 — without a reserve, the symbol loop saturates
/// the per-file budget and calls always come up empty.)
///
/// This fraction is the FLOOR: if the stored callees' exact byte cost exceeds
/// `post_envelope * CALLS_BUDGET_FRACTION`, the calls reserve is raised to the
/// exact cost (capped at `CALLS_MAX_FRACTION`). See B-0059 / 4.1.10.
const CALLS_BUDGET_FRACTION: f64 = 0.30;

/// Maximum fraction of `post_envelope` that may be reserved for calls.
///
/// Prevents calls from starving symbols entirely when there are many callees
/// with long paths. `CALLS_BUDGET_FRACTION` sets a floor; this sets a ceiling.
/// Together they implement the precise-reserve strategy (B-0059 / 4.1.10):
/// the reserve is `clamp(exact_calls_bytes, fraction_floor, fraction_ceil)`,
/// computed as `exact.max(floor).min(ceil)`.
const CALLS_MAX_FRACTION: f64 = 0.70;

/// Maximum fraction of the total budget that a single file may claim.
///
/// Without this cap a single very-high-rank file (e.g. `lib.rs`) could
/// consume the entire budget, leaving all other files empty.
///
/// The `FILE_MIN_USEFUL_BYTES` floor is applied after this cap, so for very
/// small budgets a file's allocation can still exceed this share.
const MAX_FILE_SHARE: f64 = 0.40;
4977
/// Structural priority of an AST node kind; a larger value surfaces earlier.
///
/// Serves as the ordering signal when def-level PageRank is degenerate (most
/// ranks near zero), so that ordering follows structure rather than noise.
///
/// The bands mirror how a reader orients in a file — shape before behaviour
/// before detail (Fix B, 4.0.2):
/// - 26–30: shape (traits/interfaces, structs/classes, enums, type aliases,
///   modules)
/// - 19–20: behaviour (functions/methods, impls)
/// - 9–10: declarations (consts, statics)
/// - 0: everything else (fields, variables, parameters, unknown kinds)
fn ast_kind_priority(kind: &str) -> u32 {
    // Flat lookup table: (node kind, priority).
    const KIND_PRIORITIES: &[(&str, u32)] = &[
        // Shape — what THIS file is
        ("trait_item", 30),
        ("interface", 30),
        ("trait", 30),
        ("struct_item", 29),
        ("struct", 29),
        ("class_definition", 29),
        ("class", 29),
        ("enum_item", 28),
        ("enum", 28),
        ("type_item", 27),
        ("type_alias_declaration", 27),
        ("type_alias", 27),
        ("mod_item", 26),
        ("module", 26),
        ("namespace", 26),
        // Behaviour — what THIS file does
        ("function_item", 20),
        ("function_definition", 20),
        ("function", 20),
        ("method_definition", 20),
        ("impl_item", 19),
        ("impl", 19),
        // Declarations
        ("const_item", 10),
        ("const_declaration", 10),
        ("const", 10),
        ("static_item", 9),
        ("static", 9),
    ];

    KIND_PRIORITIES
        .iter()
        .find(|&&(name, _)| name == kind)
        // Internals and unrecognised kinds fall through to the lowest band.
        .map_or(0, |&(_, priority)| priority)
}
5004
5005/// Effective AST priority with corpus-relative rank promotion (4.0.4).
5006///
5007/// Preserves the 4.0.2 AST-priority ordering by default (types first,
5008/// then functions, then fields). When a def's PageRank significantly
5009/// exceeds the corpus median, promotes it up one or two tiers so that
5010/// load-bearing defs surface alongside their declared-tier neighbors.
5011///
5012/// Thresholds are corpus-median multiples (self-calibrating):
5013/// - rank > 4× median       → +1 tier (e.g., hot function joins type tier)
5014/// - rank > 16× median      → +2 tiers (extremely hot def)
5015/// - otherwise              → declared tier preserved
5016///
5017/// On degenerate (flat) rank distributions the median equals the floor,
5018/// nothing crosses threshold, and 4.0.2 AST-priority ordering is fully
5019/// preserved. On informative distributions (post-4.0.3 enrichment),
5020/// hot defs surface proportionally.
5021fn effective_priority(kind: &str, def_rank: f32, promo_1: f32, promo_2: f32) -> u32 {
5022    let base = ast_kind_priority(kind);
5023    // Accumulate promotion tiers as a plain integer to satisfy clippy's
5024    // bool_to_int_with_if lint while preserving branch clarity.
5025    let promo_tiers: u32 = u32::from(def_rank > promo_1) + u32::from(def_rank > promo_2);
5026    // Tier spacing matches ast_kind_priority's 10-unit gaps.
5027    base + promo_tiers * 10
5028}
5029
/// Approximate serialised JSON size, in bytes, of a single `RepoMapSymbol`.
///
/// Calibrated against actual serde_json output, where a symbol looks roughly
/// like:
/// `{"name":"<N>","kind":<K>,"lsp_location":{"file_path":"<P>","start_line":0,`
/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
///
/// The estimate is a fixed ~165 bytes of overhead (braces, keys, fixed-width
/// integers, rank) plus the byte length of `name`. The `file_path` length is
/// deliberately not added here.
fn estimate_symbol_bytes(name: &str) -> usize {
    // Fixed per-symbol overhead, excluding the name itself.
    const SYMBOL_OVERHEAD_BYTES: usize = 165;
    name.len() + SYMBOL_OVERHEAD_BYTES
}
5046
/// Approximate serialised JSON size, in bytes, of a single `RepoMapCall`.
///
/// A call entry looks roughly like
/// `{"lsp_location":{"file_path":"<P>","start_line":0,`
/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`,
/// i.e. ≈ 120 bytes of fixed overhead plus the byte length of the target path.
fn estimate_call_bytes(target_path: &str) -> usize {
    // Fixed per-call overhead, excluding the path itself.
    const CALL_OVERHEAD_BYTES: usize = 120;
    target_path.len() + CALL_OVERHEAD_BYTES
}
5055
5056/// Render a `PageRank`-weighted JSON map with token-budget allocation (4.0.1).
5057///
5058/// # Algorithm
5059///
5060/// **Step 1 — File-share allocation.** Each eligible file receives a byte
5061/// budget proportional to its `base_rank`. The share is capped at 40% of
5062/// `budget_total_bytes` and floored at [`FILE_ENVELOPE_MIN_BYTES`] (200 B).
5063/// Files are included in rank order until the cumulative allocation would
5064/// exceed the total budget.
5065///
5066/// **Step 2 — Per-file symbol fill.** For each included file, symbols are
5067/// walked in def-rank descending order. Inclusion continues until either (a)
5068/// the file's budget share is exhausted (with carry-over of leftover bytes to
5069/// the next file) or (b) a logarithmic attenuation cutoff fires: symbol at
5070/// position `i` (0-based) is included only if its rank ≥ `top_rank /
5071/// (1 + ln(i + 1))`. `calls[]` is filled in target-file base-rank order
5072/// subject to a hard [`MAX_FILE_CALLS`] render cap (no attenuation — I#68,
5073/// 4.1.4) and per-file byte budget. `truncated_symbols` and `truncated_calls`
5074/// track the count of omitted entries.
5075///
5076/// **Step 3 — Response telemetry.** The response includes `estimated_bytes`
5077/// (actual returned content size), `budget_bytes` (`token_budget * 4`),
5078/// and `budget_exhausted` (`total_files > files.len()`).
5079///
5080/// # Arguments
5081///
5082/// - `graph` — the built dependency graph.
5083/// - `token_budget` — caller-specified token budget (× 4 = byte budget).
5084/// - `focus` — optional file index for topic-sensitive `PageRank`.
5085/// - `include_metadata` — when `false` (default), Meta-classified files
5086///   are excluded before ranking.
5087#[must_use]
5088#[expect(
5089    clippy::cast_precision_loss,
5090    reason = "rank sums and counts are small f32/f64; precision loss is acceptable"
5091)]
5092#[expect(
5093    clippy::too_many_lines,
5094    reason = "the three-step allocation algorithm (file-share → symbol-fill → calls-fill) \
5095              is sequential and share state; splitting into helpers would require passing \
5096              mutable slices across three boundaries with no clarity gain"
5097)]
5098pub fn render_json_budgeted(
5099    graph: &RepoGraph,
5100    token_budget: usize,
5101    focus: Option<usize>,
5102    include_metadata: bool,
5103) -> GetRepoMapResponse {
5104    let n = graph.files.len();
5105    if n == 0 {
5106        let budget_bytes = token_budget * CHARS_PER_TOKEN;
5107        return GetRepoMapResponse {
5108            files: vec![],
5109            total_files: 0,
5110            estimated_bytes: 0,
5111            budget_bytes,
5112            budget_exhausted: false,
5113            capped: false,
5114        };
5115    }
5116
5117    let budget_total_bytes = token_budget * CHARS_PER_TOKEN;
5118
5119    // Recompute topic-sensitive ranks if focus is given.
5120    let ranks = if focus.is_some() {
5121        pagerank(n, &graph.edges, focus)
5122    } else {
5123        graph.base_ranks.clone()
5124    };
5125
5126    // Sort all file indices by rank descending.
5127    let mut sorted: Vec<usize> = (0..n).collect();
5128    sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));
5129
5130    // Apply metadata exclusion filter.
5131    let eligible: Vec<usize> = if include_metadata {
5132        sorted
5133    } else {
5134        sorted
5135            .into_iter()
5136            .filter(|&idx| {
5137                let kind = content_kind_for_path(&graph.files[idx].path);
5138                kind != ContentKind::Meta
5139            })
5140            .collect()
5141    };
5142
5143    let total_files = eligible.len();
5144
5145    // ── Corpus-median def-rank thresholds for tier promotion (4.0.4) ────────
5146    //
5147    // Compute once per call (corpus-wide, not per-file) so the threshold is
5148    // self-calibrating: flat distributions (all ranks equal) set median = floor
5149    // and nothing crosses threshold; informative distributions see proportional
5150    // promotion. Using corpus-wide median ensures a hot function in one file is
5151    // judged against the entire corpus, not just its local file peers.
5152    // Use the 75th percentile of nonzero def-ranks as the corpus reference value
5153    // for tier promotion (rather than the 50th percentile / median). The 75th
5154    // percentile is more robust: on a flat distribution most defs cluster near the
5155    // floor, so the 75th percentile is only marginally above the floor (making the
5156    // 4× threshold very selective). On an informative distribution (post-4.0.3
5157    // call-edge enrichment) the 75th percentile is meaningfully above the floor,
5158    // so the same 4× multiplier captures genuinely hot defs without falsely
5159    // promoting slightly-above-floor helpers.
5160    //
5161    // The 50th percentile (lower median) was rejected because on a 10-def corpus
5162    // with max/min ratio 5× the median equals the floor, causing the 4× threshold
5163    // to fire on defs that are only 5× above floor (a low-variance corpus). The
5164    // 75th percentile corrects this without requiring hand-tuned per-corpus magic
5165    // numbers.
5166    let corpus_reference_rank: f32 = {
5167        let mut nonzero: Vec<f32> = graph
5168            .def_ranks
5169            .iter()
5170            .copied()
5171            .filter(|r| *r > 0.0)
5172            .collect();
5173        if nonzero.is_empty() {
5174            0.0
5175        } else {
5176            nonzero.sort_unstable_by(f32::total_cmp);
5177            let n = nonzero.len();
5178            // 75th percentile index: floor(0.75 * (n - 1))
5179            let idx = (3 * (n - 1)) / 4;
5180            nonzero[idx]
5181        }
5182    };
5183    let promo_1_threshold = corpus_reference_rank * 4.0; // +1 tier
5184    let promo_2_threshold = corpus_reference_rank * 16.0; // +2 tiers
5185
5186    // ── Step 1: File-share allocation ────────────────────────────────
5187
5188    // Greedily determine which files fit within the budget, computing each
5189    // file's share as it is added. We must run a two-pass approach:
5190    //   pass A: determine which files are included (cumulative sum check),
5191    //   pass B: fill symbols/calls using final per-file allocations.
5192    //
5193    // The "included" decision is based on the running cumulative sum so that
5194    // the leftover redistribution in step 2 can carry forward correctly.
5195
5196    // Floor-first admission (Fix A, 4.0.2):
5197    //
5198    // Cap admitted file count so each gets at least FILE_MIN_USEFUL_BYTES.
5199    // Below this threshold the response would carry envelopes that contain
5200    // no symbols or calls — pure overhead, no information. Concentrating
5201    // the budget on fewer files with real content is strictly better for
5202    // orientation than dropping envelope sentinels for many files.
5203    let max_admissible = budget_total_bytes / FILE_MIN_USEFUL_BYTES;
5204    let admit_count = eligible.len().min(max_admissible.max(1));
5205
5206    let budget_f64 = budget_total_bytes as f64;
5207
5208    // Pre-compute rank sum across ADMITTED files only (top-N by rank). f64
5209    // to avoid precision loss when summing many small f32 values.
5210    let admitted_rank_sum: f64 = eligible
5211        .iter()
5212        .take(admit_count)
5213        .map(|&idx| f64::from(ranks[idx]))
5214        .sum();
5215    let admitted_rank_sum = if admitted_rank_sum > 0.0 {
5216        admitted_rank_sum
5217    } else {
5218        1.0
5219    };
5220
5221    // Compute per-file budgets. Each admitted file gets at least
5222    // FILE_MIN_USEFUL_BYTES; the proportional-to-rank share is applied on
5223    // top of the floor and capped at MAX_FILE_SHARE.
5224    let mut included_indices: Vec<usize> = Vec::new(); // indices into `eligible`
5225    let mut file_budgets: Vec<usize> = Vec::new();
5226    let mut cumulative_budget: usize = 0;
5227
5228    for (i, &file_idx) in eligible.iter().take(admit_count).enumerate() {
5229        let file_rank = f64::from(ranks[file_idx]);
5230        let raw_share = budget_f64 * file_rank / admitted_rank_sum;
5231        let capped = raw_share.min(budget_f64 * MAX_FILE_SHARE);
5232        // `capped` is non-negative and bounded by budget_f64 (a usize).
5233        #[expect(
5234            clippy::cast_possible_truncation,
5235            clippy::cast_sign_loss,
5236            reason = "capped is non-negative and bounded by budget_total_bytes (a usize)"
5237        )]
5238        let budget_i = (capped as usize).max(FILE_MIN_USEFUL_BYTES);
5239
5240        if cumulative_budget + budget_i > budget_total_bytes && !included_indices.is_empty() {
5241            break;
5242        }
5243        cumulative_budget += budget_i;
5244        included_indices.push(i);
5245        file_budgets.push(budget_i);
5246    }
5247
5248    // ── Step 2: Per-file symbol fill ─────────────────────────────────
5249
5250    let mut result_files: Vec<RepoMapFile> = Vec::with_capacity(included_indices.len());
5251    let mut leftover: usize = 0; // unused bytes carried from previous file
5252
5253    for (slot, &eligible_i) in included_indices.iter().enumerate() {
5254        let file_idx = eligible[eligible_i];
5255        let file = &graph.files[file_idx];
5256        let file_rank = ranks[file_idx];
5257        let file_path_lsp = file_lsp_location(&file.path);
5258
5259        let budget_in = file_budgets[slot] + leftover;
5260
5261        // Pre-compute callee indices here (before the budget split) so the
5262        // precise byte cost of rendering all stored callees is available when
5263        // setting calls_reserve. This is the B-0059 fix (4.1.10): the old code
5264        // placed callee_indices after the symbol loop, so calls_reserve was
5265        // always the fraction-floor — even when the stored callees needed more.
5266        let callee_indices: Vec<usize> = if file_idx < graph.callees.len() {
5267            let mut callees: Vec<(usize, f32)> = graph.callees[file_idx]
5268                .iter()
5269                .filter_map(|&ci| {
5270                    let ci = ci as usize;
5271                    graph.files.get(ci).map(|_| {
5272                        let r = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
5273                        (ci, r)
5274                    })
5275                })
5276                .collect();
5277            callees.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
5278            callees.into_iter().map(|(ci, _)| ci).collect()
5279        } else {
5280            vec![]
5281        };
5282
5283        // Reserve a fraction of the post-envelope budget for outgoing calls
5284        // (Fix C, 4.0.2). Without this guard the symbol loop saturates
5285        // `budget_in` and the calls loop always trips its byte-check.
5286        // Symbol leftover flows into calls; call leftover flows to the
5287        // next file via the outer `leftover` variable.
5288        //
5289        // B-0059 (4.1.10): raise the reserve to the exact byte cost of
5290        // rendering all stored callees (up to MAX_FILE_CALLS), clamped to
5291        // [fraction_floor, CALLS_MAX_FRACTION * post_envelope]. This ensures
5292        // that at any budget where calls physically fit, they are not crowded
5293        // out by symbols consuming the fraction-floor reserve.
5294        let post_envelope = budget_in.saturating_sub(FILE_ENVELOPE_MIN_BYTES);
5295        let precise_calls_bytes: usize = callee_indices
5296            .iter()
5297            .take(MAX_FILE_CALLS)
5298            .map(|&ci| estimate_call_bytes(&graph.files[ci].path))
5299            .sum();
5300        #[expect(
5301            clippy::cast_possible_truncation,
5302            clippy::cast_sign_loss,
5303            reason = "post_envelope * fraction is bounded by post_envelope (a usize); result is non-negative"
5304        )]
5305        let fraction_floor = (post_envelope as f64 * CALLS_BUDGET_FRACTION) as usize;
5306        #[expect(
5307            clippy::cast_possible_truncation,
5308            clippy::cast_sign_loss,
5309            reason = "post_envelope * fraction is bounded by post_envelope (a usize); result is non-negative"
5310        )]
5311        let fraction_ceil = (post_envelope as f64 * CALLS_MAX_FRACTION) as usize;
5312        let calls_reserve = precise_calls_bytes.max(fraction_floor).min(fraction_ceil);
5313        let symbols_budget = FILE_ENVELOPE_MIN_BYTES + post_envelope.saturating_sub(calls_reserve);
5314        let mut used: usize = FILE_ENVELOPE_MIN_BYTES; // envelope cost
5315
5316        // ── Symbols ──────────────────────────────────────────────────
5317        // Retrieve def-level ranks for this file via the offset table.
5318        let def_count = file.defs.len();
5319        let def_offset = if file_idx < graph.def_offsets.len() {
5320            graph.def_offsets[file_idx]
5321        } else {
5322            0
5323        };
5324
5325        // Build (def_idx, rank, kind_priority, start_byte) tuples. We sort
5326        // by a composite key: AST kind priority (descending) — putting types
5327        // before functions before fields — then by def_rank (descending)
5328        // within each tier. This is Fix B (4.0.2): the def_rank distribution
5329        // is often degenerate (most defs share near-zero rank because the
5330        // call-edge extractor doesn't capture every dispatch), so we use
5331        // structural signal as the primary ordering and def_rank as the
5332        // within-tier tiebreaker. When def_rank IS informative, it dominates
5333        // *within* its kind tier and recovers the original behavior; the AST
5334        // signal only shifts ordering *between* tiers.
5335        let mut def_rank_pairs: Vec<(usize, f32, u32, u32)> = (0..def_count)
5336            .map(|di| {
5337                let flat = def_offset + di;
5338                let r = graph.def_ranks.get(flat).copied().unwrap_or(0.0);
5339                // Store the ORIGINAL ast_kind_priority in the tuple (used by the
5340                // per-tier attenuation loop below). The sort comparator uses
5341                // effective_priority (which may be higher due to 4.0.4 promotion)
5342                // to reorder hot defs ahead of cold type-tier defs, while the
5343                // attenuation tier tracker continues to use the original AST tier
5344                // so the existing per-tier cutoff behaviour is preserved.
5345                let kind_prio = ast_kind_priority(&file.defs[di].kind);
5346                let decl_order = file.defs[di].start_byte;
5347                (di, r, kind_prio, decl_order)
5348            })
5349            .collect();
5350        def_rank_pairs.sort_unstable_by(|a, b| {
5351            // Primary: effective priority (4.0.4: AST kind + corpus-rank promotion) descending.
5352            // Hot defs that exceed corpus-median thresholds are promoted above their
5353            // declared tier so they surface before cold type-tier defs.
5354            let eff_a = effective_priority(
5355                &file.defs[a.0].kind,
5356                a.1,
5357                promo_1_threshold,
5358                promo_2_threshold,
5359            );
5360            let eff_b = effective_priority(
5361                &file.defs[b.0].kind,
5362                b.1,
5363                promo_1_threshold,
5364                promo_2_threshold,
5365            );
5366            eff_b
5367                .cmp(&eff_a)
5368                // Secondary: def_rank descending within tier.
5369                .then_with(|| b.1.total_cmp(&a.1))
5370                // Tertiary: earlier declaration order (stable, deterministic).
5371                .then_with(|| a.3.cmp(&b.3))
5372        });
5373
5374        let top_def_rank = def_rank_pairs.first().map(|&(_, r, _, _)| r).unwrap_or(0.0);
5375
5376        let mut symbols: Vec<RepoMapSymbol> = Vec::new();
5377        let mut truncated_symbols: usize = 0;
5378
5379        // Track per-tier position for the attenuation cutoff. When AST kind
5380        // priority changes (we've moved from types to functions, say), reset
5381        // the position so the attenuation curve restarts. Otherwise a
5382        // structurally-equivalent-but-later tier would be unfairly cut.
5383        let mut tier_pos: usize = 0;
5384        let mut current_tier: Option<u32> = None;
5385        let mut tier_top_rank: f32 = top_def_rank;
5386
5387        for (pos, &(di, def_r, kind_prio, _)) in def_rank_pairs.iter().enumerate() {
5388            // Reset attenuation at tier boundaries.
5389            if current_tier != Some(kind_prio) {
5390                current_tier = Some(kind_prio);
5391                tier_pos = 0;
5392                tier_top_rank = def_r;
5393            }
5394
5395            // Logarithmic attenuation cutoff, relative to the tier's top rank.
5396            let cutoff = if tier_top_rank > 0.0 {
5397                tier_top_rank / (1.0 + (tier_pos as f32 + 1.0).ln())
5398            } else {
5399                0.0
5400            };
5401            if def_r < cutoff {
5402                // Attenuation cuts the rest of THIS tier; we don't stop
5403                // entirely because the next tier may still have useful
5404                // content within its own attenuation curve. Skip this def.
5405                truncated_symbols += 1;
5406                tier_pos += 1;
5407                continue;
5408            }
5409
5410            let def = &file.defs[di];
5411            let sym_bytes = estimate_symbol_bytes(&def.name);
5412            // Use the reserved symbols sub-budget (Fix C) so calls aren't
5413            // starved when symbols would otherwise saturate budget_in.
5414            if used + sym_bytes > symbols_budget {
5415                truncated_symbols += def_rank_pairs.len() - pos;
5416                break;
5417            }
5418
5419            // C2 (4.1.1): Use the AST-computed lsp_kind_hint when available
5420            // (populated at parse time for Python decorated_definition nodes).
5421            // Fall back to the AST-less string mapping for all other kinds.
5422            let kind = def
5423                .lsp_kind_hint
5424                .unwrap_or_else(|| crate::languages::lsp_symbol_kind_for_node_kind(&def.kind));
5425            let line_0 = def.start_line.saturating_sub(1) as usize;
5426            symbols.push(RepoMapSymbol {
5427                name: def.name.clone(),
5428                kind,
5429                lsp_location: RepoMapLspLocation {
5430                    file_path: file_path_lsp.file_path.clone(),
5431                    start_line: line_0,
5432                    start_character: 0,
5433                    end_line: line_0,
5434                    end_character: 0,
5435                },
5436                rank: def_r,
5437            });
5438            used += sym_bytes;
5439            tier_pos += 1;
5440        }
5441
5442        // ── Calls ─────────────────────────────────────────────────────
5443        // `callee_indices` was pre-computed above (before the budget split) for
5444        // the precise-reserve calculation. Re-use it here.
5445        let call_total = callee_indices.len();
5446
5447        let mut calls: Vec<RepoMapCall> = Vec::new();
5448        let mut truncated_calls: usize = 0;
5449
5450        for (pos, &ci) in callee_indices.iter().enumerate() {
5451            let callee_rank = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
5452
5453            // Hard render-time cap (I#68, 4.1.4): stop once we have rendered
5454            // MAX_FILE_CALLS entries. This is symmetric with the MAX_NEIGHBORS
5455            // graph-build cap (I#60) and replaces the old logarithmic attenuation
5456            // cutoff. Attenuation was correct for *symbols* (rank distributions
5457            // are informative) but pathological for *call edges*: in real corpora
5458            // callee base-ranks follow a geometric distribution, causing attenuation
5459            // to fire at pos=1 and collapse calls[] to a single entry.
5460            if calls.len() >= MAX_FILE_CALLS {
5461                truncated_calls += call_total - pos;
5462                break;
5463            }
5464
5465            let callee_path = &graph.files[ci].path;
5466            let call_bytes = estimate_call_bytes(callee_path);
5467            if used + call_bytes > budget_in {
5468                truncated_calls += call_total - pos;
5469                break;
5470            }
5471
5472            calls.push(RepoMapCall {
5473                lsp_location: file_lsp_location(callee_path),
5474                rank: callee_rank,
5475            });
5476            used += call_bytes;
5477        }
5478
5479        // Carry unused bytes forward to the next file.
5480        leftover = budget_in.saturating_sub(used);
5481
5482        result_files.push(RepoMapFile {
5483            lsp_location: file_path_lsp,
5484            rank: file_rank,
5485            content_kind: content_kind_tag(content_kind_for_path(&file.path)),
5486            calls,
5487            symbols,
5488            truncated_symbols,
5489            truncated_calls,
5490        });
5491    }
5492
5493    let estimated_bytes = serde_json::to_string(&result_files)
5494        .map(|s| s.len())
5495        .unwrap_or(0);
5496
5497    let budget_exhausted = total_files > result_files.len();
5498
5499    GetRepoMapResponse {
5500        files: result_files,
5501        total_files,
5502        estimated_bytes,
5503        budget_bytes: budget_total_bytes,
5504        budget_exhausted,
5505        capped: budget_exhausted,
5506    }
5507}
5508
5509/// Render a `PageRank`-sorted JSON map of the repository (4.0.0 compatibility shim).
5510///
5511/// This function wraps [`render_json_budgeted`] with a synthetic token budget
5512/// derived from `max_files * 2000` (a generous per-file allowance). It exists
5513/// to keep the existing D1/D2 unit tests compiling without change; the MCP
5514/// layer calls [`render_json_budgeted`] directly in 4.0.1.
5515///
5516/// The `capped` field in the response reflects whether the budget was
5517/// exhausted before all `eligible` files were included, which is equivalent
5518/// to the previous `total_files > max_files` check.
5519///
5520/// When `include_metadata` is `false` (default), files whose extension
5521/// classifies as [`ContentKind::Meta`] are excluded before ranking.
5522#[must_use]
5523pub fn render_json(
5524    graph: &RepoGraph,
5525    max_files: usize,
5526    focus: Option<usize>,
5527    include_metadata: bool,
5528) -> GetRepoMapResponse {
5529    // Synthesise a generous token budget: 2000 tokens per requested file.
5530    // This ensures the existing D1/D2 tests (which pass small max_files values
5531    // like 3, 5, 50) see the same file-count behaviour they expect. The test
5532    // assertions check file counts, not byte sizes, so the exact budget value
5533    // only matters for ensuring enough headroom.
5534    let token_budget = max_files.saturating_mul(2000);
5535    render_json_budgeted(graph, token_budget, focus, include_metadata)
5536}
5537
5538// ── Tests ────────────────────────────────────────────────────────────
5539
5540#[cfg(test)]
5541mod tests {
5542    use super::*;
5543
5544    #[test]
5545    fn test_pagerank_simple() {
5546        // 3-node graph: 0 -> 1 -> 2, 2 -> 0 (cycle)
5547        let edges = vec![(0, 1, 1), (1, 2, 1), (2, 0, 1)];
5548        let ranks = pagerank(3, &edges, None);
5549
5550        // All nodes in a symmetric cycle should have equal rank
5551        assert_eq!(ranks.len(), 3);
5552        let sum: f32 = ranks.iter().sum();
5553        assert!(
5554            (sum - 1.0).abs() < 0.01,
5555            "ranks should sum to ~1.0, got {sum}"
5556        );
5557
5558        // In a perfect cycle, all ranks should be approximately equal
5559        let expected = 1.0 / 3.0;
5560        for (i, &r) in ranks.iter().enumerate() {
5561            assert!(
5562                (r - expected).abs() < 0.05,
5563                "rank[{i}] = {r}, expected ~{expected}"
5564            );
5565        }
5566    }
5567
5568    #[test]
5569    fn test_pagerank_star() {
5570        // Star graph: 0,1,2 all point to 3
5571        let edges = vec![(0, 3, 1), (1, 3, 1), (2, 3, 1)];
5572        let ranks = pagerank(4, &edges, None);
5573
5574        assert_eq!(ranks.len(), 4);
5575        // Node 3 should have the highest rank
5576        let max_idx = ranks
5577            .iter()
5578            .enumerate()
5579            .max_by(|a, b| a.1.total_cmp(b.1))
5580            .unwrap()
5581            .0;
5582        assert_eq!(max_idx, 3, "node 3 should have highest rank");
5583        assert!(
5584            ranks[3] > ranks[0],
5585            "rank[3]={} should be > rank[0]={}",
5586            ranks[3],
5587            ranks[0]
5588        );
5589    }
5590
5591    #[test]
5592    fn test_pagerank_topic_sensitive() {
5593        // 10-node chain: 0 -> 1 -> ... -> 9.
5594        //
5595        // With PERSONALIZATION_ALPHA = 0.15 and n = 10, the uniform share per
5596        // node is 1/10 = 0.10.  The focus node (0) gets 0.15 teleportation
5597        // mass vs 0.10 uniform, so focused rank[0] > uniform rank[0] holds.
5598        //
5599        // The 3-node chain used previously broke when alpha was reduced from
5600        // 0.70 to 0.15 because 0.15 < 1/3 = 0.33 for n=3 — the focus node
5601        // received *less* teleportation than its uniform share, inverting the
5602        // expected direction.  Using n=10 avoids this edge case while still
5603        // testing the personalization effect.
5604        let n = 10_usize;
5605        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5606        let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
5607            .map(|i| (i as u32, (i + 1) as u32, 1_u32))
5608            .collect();
5609        let uniform_ranks = pagerank(n, &edges, None);
5610        let biased_ranks = pagerank(n, &edges, Some(0));
5611
5612        // With focus on node 0, it should get a higher rank than uniform
5613        // because PERSONALIZATION_ALPHA (0.15) > 1/n (0.10) for n=10.
5614        assert!(
5615            biased_ranks[0] > uniform_ranks[0],
5616            "focused rank[0]={} should be > uniform rank[0]={}",
5617            biased_ranks[0],
5618            uniform_ranks[0]
5619        );
5620    }
5621
5622    // ── J1 tests — topic-sensitive PageRank soft personalization ─────────
5623
5624    /// J1 RED: `focus_file` PageRank must not collapse other-file ranks.
5625    ///
5626    /// Baseline (pre-4.0.5) concentrated 70% mass on the focus node, producing
5627    /// a degenerate Dirac delta: focus rank ≈ 0.703, all others ≈ 0.003.
5628    /// This test fails on the baseline and must pass after the fix.
5629    ///
5630    /// Invariant: with `PERSONALIZATION_ALPHA = 0.15`, focus node gets 0.15 of
5631    /// teleportation mass and each of the other (n-1) nodes gets 0.85/(n-1).
5632    /// On a star graph with n=10 nodes, the focus node rank must NOT be more
5633    /// than 40× the average non-focus rank.  The 4.0.5 fix targets roughly
5634    /// 5-10× for a well-connected graph, so 40× is a conservative upper bound
5635    /// that the baseline (≈200×) fails.
5636    #[test]
5637    fn test_focus_file_topic_pagerank_preserves_rank_dispersion() {
5638        // Star graph: nodes 1..9 all point to node 0 (high natural rank).
5639        // Focus on node 1 (low natural rank) to test personalization effect.
5640        let n = 10_usize;
5641        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5642        let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
5643
5644        let ranks_focused = pagerank(n, &edges, Some(1));
5645
5646        let focus_rank = ranks_focused[1];
5647        let sum_non_focus: f32 = ranks_focused
5648            .iter()
5649            .enumerate()
5650            .filter(|&(i, _)| i != 1)
5651            .map(|(_, &r)| r)
5652            .sum();
5653        let n_non_focus = (n - 1) as f32;
5654        let avg_non_focus = sum_non_focus / n_non_focus;
5655
5656        let dispersion_ratio = focus_rank / avg_non_focus;
5657
5658        eprintln!(
5659            "J1 dispersion: focus_rank={focus_rank:.6}, avg_non_focus={avg_non_focus:.6}, \
5660             ratio={dispersion_ratio:.2}× (must be <= 40×)"
5661        );
5662
5663        // With 0.15 personalization alpha the focus node's teleportation
5664        // advantage is modest; 40× is an upper bound the old 0.70 code violates.
5665        assert!(
5666            dispersion_ratio <= 40.0,
5667            "focus rank is {dispersion_ratio:.1}× avg non-focus rank (must be ≤ 40×); \
5668             pre-fix baseline was ~200× due to 70% concentration — I#16"
5669        );
5670
5671        // Ranks must still sum to ~1.
5672        let total: f32 = ranks_focused.iter().sum();
5673        assert!(
5674            (total - 1.0).abs() < 0.01,
5675            "ranks must sum to ≈1.0; got {total}"
5676        );
5677    }
5678
5679    /// J1 RED: focus node must have the highest rank (it still gets the bias),
5680    /// but non-focus nodes must NOT collapse to a flat floor.
5681    ///
5682    /// Concretely: the second-highest-ranked file must be ≥ 10% of the focus
5683    /// file's rank (neighborhood rebiasing, not winner-take-all).
5684    #[test]
5685    fn test_focus_file_topic_pagerank_does_not_collapse_other_files() {
5686        // Linear chain: 0 → 1 → 2 → ... → 9 (directed).
5687        // Focus on node 0.  Without personalization, ranks decrease along the
5688        // chain.  With soft personalization the non-focus nodes stay non-trivial.
5689        let n = 10_usize;
5690        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5691        let edges: Vec<(u32, u32, u32)> = (0..(n - 1))
5692            .map(|i| (i as u32, (i + 1) as u32, 1_u32))
5693            .collect();
5694
5695        let ranks = pagerank(n, &edges, Some(0));
5696
5697        let focus_rank = ranks[0];
5698        // All non-focus ranks must be ≥ 10% of focus rank.
5699        for (i, &r) in ranks.iter().enumerate().skip(1) {
5700            assert!(
5701                r >= focus_rank * 0.10,
5702                "rank[{i}] = {r:.6} is < 10% of focus rank {focus_rank:.6}; \
5703                 non-focus files must not collapse to near-zero (I#16)"
5704            );
5705        }
5706    }
5707
5708    // ── J2 tests — neighborhood count parity ─────────────────────────────
5709
5710    /// J2 RED: `render_json_budgeted` with `focus=Some(i)` must return at
5711    /// least 70% as many files as the unfocused call with the same budget.
5712    ///
5713    /// Baseline (pre-4.0.5, α=0.70) collapsed the focused run to 1 dominant
5714    /// file + a flat tail — that's what this test was originally written to
5715    /// catch. The threshold was 80% when α was 0.15; raising α to 0.35
5716    /// (4.1.12, to satisfy the flask focus-in-top-3 invariant) pulls budget
5717    /// toward the focus neighborhood and drops file count to ~70% on small
5718    /// synthetic stars. 70% remains far from the "1 file + zero tail"
5719    /// degenerate baseline this test guards against.
5720    #[test]
5721    fn test_focus_file_returns_neighborhood_not_just_focus() {
5722        // Build a 12-file star graph with meaningful rank variation.
5723        let n = 12_usize;
5724        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5725        let edges: Vec<(u32, u32, u32)> = (1..n).map(|i| (i as u32, 0_u32, 1_u32)).collect();
5726        let base_ranks = pagerank(n, &edges, None);
5727        let (callers, callees) = build_neighbor_lists(n, &edges);
5728
5729        let file_nodes: Vec<FileNode> = (0..n)
5730            .map(|i| FileNode {
5731                path: format!("src/file_{i}.rs"),
5732                defs: vec![Definition {
5733                    name: format!("func_{i}"),
5734                    kind: "function_item".to_string(),
5735                    start_line: 1,
5736                    end_line: 5,
5737                    scope: String::new(),
5738                    signature: Some(format!("fn func_{i}() -> i32")),
5739                    start_byte: 0,
5740                    end_byte: 100,
5741                    calls: vec![],
5742                    decorator: None,
5743                    lsp_kind_hint: None,
5744                }],
5745                imports: vec![],
5746            })
5747            .collect();
5748
5749        let graph = RepoGraph {
5750            files: file_nodes,
5751            edges,
5752            base_ranks,
5753            callers,
5754            callees,
5755            def_edges: vec![],
5756            def_ranks: vec![],
5757            def_callers: vec![],
5758            def_callees: vec![],
5759            def_offsets: vec![0],
5760            alpha: 0.5,
5761        };
5762
5763        let budget = 2000; // generous budget; all 12 files should fit
5764        let unfocused = render_json_budgeted(&graph, budget, None, false);
5765        let focused = render_json_budgeted(&graph, budget, Some(1), false);
5766
5767        let unfocused_n = unfocused.files.len();
5768        let focused_n = focused.files.len();
5769        #[expect(
5770            clippy::cast_possible_truncation,
5771            clippy::cast_sign_loss,
5772            reason = "unfocused_n is a file count (small, positive); f32 multiplication \
5773                      by 0.80 and ceil produce a value in [0, n]; truncation to usize is safe"
5774        )]
5775        let min_expected = (unfocused_n as f32 * 0.70).ceil() as usize;
5776
5777        eprintln!(
5778            "J2 neighborhood: unfocused={unfocused_n} files, focused={focused_n} files \
5779             (need ≥ {min_expected})"
5780        );
5781
5782        assert!(
5783            focused_n >= min_expected,
5784            "focused call returned {focused_n} files; expected ≥ {min_expected} \
5785             (70% of unfocused {unfocused_n}); soft personalization must preserve \
5786             rank dispersion across files (I#16/J2)"
5787        );
5788    }
5789
5790    /// J2 RED: topic delta fingerprinting — focused run must reorder files
5791    /// relative to unfocused run (focus file surfaces near top), but both
5792    /// must contain similar total file counts.
5793    #[test]
5794    fn test_focus_delta_topic_fingerprinting_works() {
5795        // Bidirectional 8-file ring so all nodes are structurally equivalent.
5796        // Without focus all ranks are equal.  With focus on node 3, node 3
5797        // must surface as the highest-ranked file.
5798        let n = 8_usize;
5799        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
5800        let edges: Vec<(u32, u32, u32)> = (0..n)
5801            .flat_map(|i| {
5802                let next = ((i + 1) % n) as u32;
5803                let curr = i as u32;
5804                [(curr, next, 1_u32), (next, curr, 1_u32)]
5805            })
5806            .collect();
5807
5808        let ranks_uniform = pagerank(n, &edges, None);
5809        let ranks_focused = pagerank(n, &edges, Some(3));
5810
5811        // Focus node must have highest rank.
5812        let top_idx = ranks_focused
5813            .iter()
5814            .enumerate()
5815            .max_by(|a, b| a.1.total_cmp(b.1))
5816            .map(|(i, _)| i)
5817            .unwrap();
5818
5819        assert_eq!(
5820            top_idx, 3,
5821            "with focus=Some(3), node 3 must have highest rank; top was {top_idx}"
5822        );
5823
5824        // Uniform baseline: all ranks should be approximately equal.
5825        let uniform_max = ranks_uniform
5826            .iter()
5827            .copied()
5828            .fold(f32::NEG_INFINITY, f32::max);
5829        let uniform_min = ranks_uniform.iter().copied().fold(f32::INFINITY, f32::min);
5830        assert!(
5831            (uniform_max - uniform_min).abs() < 0.01,
5832            "on a ring without focus all ranks should be ≈equal; max={uniform_max:.6} min={uniform_min:.6}"
5833        );
5834
5835        // Focused run must rank the focus node significantly higher than others
5836        // but others must remain non-trivial (≥ 5% of focus).
5837        let focus_rank = ranks_focused[3];
5838        for (i, &r) in ranks_focused.iter().enumerate().filter(|&(i, _)| i != 3) {
5839            assert!(
5840                r >= focus_rank * 0.05,
5841                "rank[{i}]={r:.6} is < 5% of focus rank {focus_rank:.6}; \
5842                 soft personalization must preserve non-focus ranks"
5843            );
5844        }
5845    }
5846
5847    // ── T1 tests — focus_file resolver normalization (I#20) ──────────────
5848
5849    /// Build a tiny synthetic graph whose `FileNode::path` values match the
5850    /// shape `build_graph` produces on disk (no leading `./`, forward slashes).
5851    fn focus_resolver_graph() -> RepoGraph {
5852        let file_nodes: Vec<FileNode> = vec![
5853            FileNode {
5854                path: "device_opt/services/storage.py".to_string(),
5855                defs: vec![],
5856                imports: vec![],
5857            },
5858            FileNode {
5859                path: "device_opt/ui/textual/screens/settings.py".to_string(),
5860                defs: vec![],
5861                imports: vec![],
5862            },
5863            FileNode {
5864                path: "device_opt/services/registry.py".to_string(),
5865                defs: vec![],
5866                imports: vec![],
5867            },
5868            FileNode {
5869                path: "tests/test_storage.py".to_string(),
5870                defs: vec![],
5871                imports: vec![],
5872            },
5873        ];
5874        let n = file_nodes.len();
5875        RepoGraph {
5876            files: file_nodes,
5877            edges: vec![],
5878            base_ranks: vec![1.0 / n as f32; n],
5879            callers: vec![vec![]; n],
5880            callees: vec![vec![]; n],
5881            def_edges: vec![],
5882            def_ranks: vec![],
5883            def_callers: vec![],
5884            def_callees: vec![],
5885            def_offsets: vec![0; n + 1],
5886            alpha: 0.5,
5887        }
5888    }
5889
5890    /// T1: focus_file paths emitted by `lsp_location.file_path` (with the
5891    /// `./` prefix) must resolve to the correct file index.
5892    ///
5893    /// Baseline reproduction (mnemosyne corpus, 4.0.5): passing
5894    /// `focus_file="./device_opt/.../settings.py"` produced rank values
5895    /// bit-identical to the unfocused call because the strict-suffix matcher
5896    /// in `tools.rs` failed both the `exact` and the `strict_suffix` checks
5897    /// when the focus carried the LSP `./` prefix. The matcher silently
5898    /// returned `focus = None`, masking the failure as "topic-sensitive
5899    /// PageRank does nothing on Python".
5900    #[test]
5901    fn test_focus_file_resolver_accepts_lsp_location_path() {
5902        let g = focus_resolver_graph();
5903        // LSP-shaped path with leading `./` — the form documented in
5904        // get_repo_map's instructions.
5905        let res = g.resolve_focus_file("./device_opt/ui/textual/screens/settings.py");
5906        match res {
5907            FocusResolution::Found(idx) => {
5908                assert_eq!(
5909                    g.files[idx].path, "device_opt/ui/textual/screens/settings.py",
5910                    "resolver must accept the ./-prefixed LSP path form (I#20)"
5911                );
5912            }
5913            FocusResolution::NotFound | FocusResolution::Ambiguous(_) => {
5914                panic!(
5915                    "resolver returned {res:?} for ./device_opt/ui/textual/screens/settings.py; \
5916                     the LSP-shaped path form must resolve to exactly one file (I#20)"
5917                );
5918            }
5919        }
5920    }
5921
5922    /// T1: the bare stored path (no `./`) must continue to resolve.
5923    /// Regression guard for the pre-fix matcher's "exact" path.
5924    #[test]
5925    fn test_focus_file_resolver_accepts_bare_stored_path() {
5926        let g = focus_resolver_graph();
5927        let res = g.resolve_focus_file("device_opt/services/storage.py");
5928        match res {
5929            FocusResolution::Found(idx) => {
5930                assert_eq!(g.files[idx].path, "device_opt/services/storage.py");
5931            }
5932            other => panic!("expected Found, got {other:?}"),
5933        }
5934    }
5935
5936    /// T1: strict-suffix match — `storage.py` must match
5937    /// `device_opt/services/storage.py` (prev char is `/`) but ambiguity
5938    /// (two `storage.py` files) must be reported, not silently picked.
5939    #[test]
5940    fn test_focus_file_resolver_strict_suffix_and_ambiguity() {
5941        let g = focus_resolver_graph();
5942        // "storage.py" matches both device_opt/services/storage.py and
5943        // tests/test_storage.py? No — test_storage.py has `_` before `storage.py`
5944        // (not `/`), so the strict-suffix matcher rejects it. Only one match.
5945        let res = g.resolve_focus_file("storage.py");
5946        assert!(
5947            matches!(res, FocusResolution::Found(_)),
5948            "strict-suffix `storage.py` must match exactly one file (the `_` in \
5949             test_storage.py blocks the strict-suffix), got {res:?}"
5950        );
5951        // Add a second services/storage.py-shaped file to force ambiguity.
5952        let mut g2 = g.clone();
5953        g2.files.push(FileNode {
5954            path: "vendored/services/storage.py".to_string(),
5955            defs: vec![],
5956            imports: vec![],
5957        });
5958        g2.base_ranks.push(0.0);
5959        g2.callers.push(vec![]);
5960        g2.callees.push(vec![]);
5961        g2.def_offsets.push(*g2.def_offsets.last().unwrap());
5962        let res = g2.resolve_focus_file("storage.py");
5963        match res {
5964            FocusResolution::Ambiguous(cands) => {
5965                assert_eq!(cands.len(), 2, "expected two candidates, got {cands:?}");
5966            }
5967            other => panic!("expected Ambiguous, got {other:?}"),
5968        }
5969    }
5970
5971    /// T1: a focus that matches no file returns `NotFound`. The caller
5972    /// is responsible for either treating this as unfocused or surfacing
5973    /// an error — the resolver itself does not impose policy.
5974    #[test]
5975    fn test_focus_file_resolver_not_found() {
5976        let g = focus_resolver_graph();
5977        let res = g.resolve_focus_file("./does/not/exist.py");
5978        assert!(
5979            matches!(res, FocusResolution::NotFound),
5980            "expected NotFound, got {res:?}"
5981        );
5982    }
5983
5984    /// T1: empty focus does not match anything (avoids the empty-suffix
5985    /// degenerate that would otherwise match every file).
5986    #[test]
5987    fn test_focus_file_resolver_empty_input_is_not_found() {
5988        let g = focus_resolver_graph();
5989        let res = g.resolve_focus_file("");
5990        assert!(
5991            matches!(res, FocusResolution::NotFound),
5992            "empty focus must not match anything, got {res:?}"
5993        );
5994    }
5995
5996    /// T1: focus_file rank delta must be visible on a Python-shaped
5997    /// synthetic graph.
5998    ///
5999    /// Builds a small Python-style call graph (FileNode + Definition with
6000    /// resolved CallRefs, matching what `extract_calls` produces on a real
6001    /// Python corpus), runs `build_graph_from_files_pub` to get a
6002    /// `RepoGraph`, then calls `render_json_budgeted` with and without
6003    /// focus. Asserts that the focused call changes the rank of at least
6004    /// one non-focus file by ≥ 5% in either direction.
6005    ///
6006    /// On the baseline (pre-T1) this test passes when the caller supplies
6007    /// an int focus_idx directly — the engine's topic-sensitive PageRank is
6008    /// correct. The bug was at the string-to-int resolver layer in
6009    /// `tools.rs`, which silently masked the failure as "the rendering
6010    /// path doesn't propagate focus". This test locks the engine's
6011    /// behavior so a future regression in the rendering path is caught.
6012    #[test]
6013    #[expect(
6014        clippy::too_many_lines,
6015        reason = "synthetic Python-shaped graph (five FileNodes with defs + \
6016                  CallRefs + ImportRefs) plus the two-call assertion sequence \
6017                  is inherently long; splitting into helpers would obscure the \
6018                  one-shot reproduction the test is locking in."
6019    )]
6020    fn test_focus_file_rank_delta_visible_on_python_corpus() {
6021        // Five files, Python-shaped: services/storage.py (a "hub" that two
6022        // UI files call into) plus a tests/ file. The Python tree-sitter
6023        // extractor produces `class_definition` and `function_definition`
6024        // kinds with resolved CallRefs pointing at the hub.
6025        let mut files: Vec<FileNode> = vec![
6026            FileNode {
6027                path: "device_opt/services/storage.py".to_string(),
6028                defs: vec![
6029                    Definition {
6030                        name: "ScanStore".to_string(),
6031                        kind: "class_definition".to_string(),
6032                        start_line: 1,
6033                        end_line: 80,
6034                        scope: String::new(),
6035                        signature: None,
6036                        start_byte: 0,
6037                        end_byte: 2000,
6038                        calls: vec![],
6039                        decorator: None,
6040                        lsp_kind_hint: None,
6041                    },
6042                    Definition {
6043                        name: "save_scan".to_string(),
6044                        kind: "function_definition".to_string(),
6045                        start_line: 20,
6046                        end_line: 40,
6047                        scope: "class_definition ScanStore".to_string(),
6048                        signature: Some("def save_scan(self, scan)".to_string()),
6049                        start_byte: 200,
6050                        end_byte: 600,
6051                        calls: vec![],
6052                        decorator: None,
6053                        lsp_kind_hint: None,
6054                    },
6055                ],
6056                imports: vec![],
6057            },
6058            FileNode {
6059                path: "device_opt/services/registry.py".to_string(),
6060                defs: vec![Definition {
6061                    name: "register".to_string(),
6062                    kind: "function_definition".to_string(),
6063                    start_line: 1,
6064                    end_line: 30,
6065                    scope: String::new(),
6066                    signature: Some("def register(svc)".to_string()),
6067                    start_byte: 0,
6068                    end_byte: 600,
6069                    calls: vec![CallRef {
6070                        name: "save_scan".to_string(),
6071                        qualified_path: None,
6072                        receiver_type: None,
6073                        byte_offset: 100,
6074                        resolved: None,
6075                    }],
6076                    decorator: None,
6077                    lsp_kind_hint: None,
6078                }],
6079                imports: vec![ImportRef {
6080                    raw_path: "from device_opt.services import storage".to_string(),
6081                    resolved_idx: Some(0),
6082                }],
6083            },
6084            FileNode {
6085                path: "device_opt/ui/screens/browse.py".to_string(),
6086                defs: vec![Definition {
6087                    name: "browse_scans".to_string(),
6088                    kind: "function_definition".to_string(),
6089                    start_line: 1,
6090                    end_line: 50,
6091                    scope: String::new(),
6092                    signature: Some("def browse_scans(app)".to_string()),
6093                    start_byte: 0,
6094                    end_byte: 1000,
6095                    calls: vec![CallRef {
6096                        name: "save_scan".to_string(),
6097                        qualified_path: None,
6098                        receiver_type: None,
6099                        byte_offset: 200,
6100                        resolved: None,
6101                    }],
6102                    decorator: None,
6103                    lsp_kind_hint: None,
6104                }],
6105                imports: vec![ImportRef {
6106                    raw_path: "from device_opt.services import storage".to_string(),
6107                    resolved_idx: Some(0),
6108                }],
6109            },
6110            FileNode {
6111                path: "device_opt/ui/screens/settings.py".to_string(),
6112                defs: vec![Definition {
6113                    name: "open_settings".to_string(),
6114                    kind: "function_definition".to_string(),
6115                    start_line: 1,
6116                    end_line: 40,
6117                    scope: String::new(),
6118                    signature: Some("def open_settings(app)".to_string()),
6119                    start_byte: 0,
6120                    end_byte: 800,
6121                    calls: vec![CallRef {
6122                        name: "register".to_string(),
6123                        qualified_path: None,
6124                        receiver_type: None,
6125                        byte_offset: 150,
6126                        resolved: None,
6127                    }],
6128                    decorator: None,
6129                    lsp_kind_hint: None,
6130                }],
6131                imports: vec![ImportRef {
6132                    raw_path: "from device_opt.services import registry".to_string(),
6133                    resolved_idx: Some(1),
6134                }],
6135            },
6136            FileNode {
6137                path: "tests/test_storage.py".to_string(),
6138                defs: vec![Definition {
6139                    name: "test_save".to_string(),
6140                    kind: "function_definition".to_string(),
6141                    start_line: 1,
6142                    end_line: 20,
6143                    scope: String::new(),
6144                    signature: Some("def test_save()".to_string()),
6145                    start_byte: 0,
6146                    end_byte: 400,
6147                    calls: vec![CallRef {
6148                        name: "save_scan".to_string(),
6149                        qualified_path: None,
6150                        receiver_type: None,
6151                        byte_offset: 50,
6152                        resolved: None,
6153                    }],
6154                    decorator: None,
6155                    lsp_kind_hint: None,
6156                }],
6157                imports: vec![ImportRef {
6158                    raw_path: "from device_opt.services import storage".to_string(),
6159                    resolved_idx: Some(0),
6160                }],
6161            },
6162        ];
6163
6164        // Resolve calls so the graph builder has edges to chew on.
6165        let def_index = build_def_index(&files);
6166        resolve_calls(&mut files, &def_index, &HashMap::new());
6167        let graph = build_graph_from_files_pub(files);
6168
6169        // Sanity: the graph must have edges (the calls were resolved).
6170        assert!(
6171            !graph.edges.is_empty(),
6172            "Python-shaped synthetic graph must produce file-level edges; got 0. \
6173             The CallRefs may have failed to resolve."
6174        );
6175
6176        // Resolve the focus file via the new helper.
6177        let focus_idx = match graph.resolve_focus_file("./device_opt/ui/screens/settings.py") {
6178            FocusResolution::Found(i) => i,
6179            other => panic!("resolver must find settings.py via LSP-shaped path, got {other:?}"),
6180        };
6181
6182        let budget = 4000;
6183        let unfocused = render_json_budgeted(&graph, budget, None, false);
6184        let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
6185
6186        // Collect rank-by-path maps for both runs.
6187        let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
6188            .files
6189            .iter()
6190            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
6191            .collect();
6192        let focused_ranks: std::collections::HashMap<String, f32> = focused
6193            .files
6194            .iter()
6195            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
6196            .collect();
6197
6198        eprintln!("T1 Python — unfocused ranks: {unfocused_ranks:#?}");
6199        eprintln!("T1 Python — focused ranks:   {focused_ranks:#?}");
6200
6201        // Find at least one non-focus file whose rank changed by ≥ 5% in
6202        // either direction. The threshold is conservative; the soft 0.15
6203        // personalization alpha redistributes mass enough that on this
6204        // 5-node graph the affected neighbors typically shift by 20%+.
6205        let focus_path = "./device_opt/ui/screens/settings.py";
6206        let mut max_delta_ratio = 0.0_f32;
6207        for (path, &u_rank) in &unfocused_ranks {
6208            if path == focus_path {
6209                continue;
6210            }
6211            if let Some(&f_rank) = focused_ranks.get(path)
6212                && u_rank > 0.0
6213            {
6214                let ratio = (f_rank - u_rank).abs() / u_rank;
6215                if ratio > max_delta_ratio {
6216                    max_delta_ratio = ratio;
6217                }
6218            }
6219        }
6220        assert!(
6221            max_delta_ratio >= 0.05,
6222            "focus_file must rebias non-focus file ranks by ≥ 5%; \
6223             max observed delta ratio = {max_delta_ratio:.3} \
6224             (I#20: focus_file invisible on Python corpora)"
6225        );
6226
6227        // Bit-identity guard: at least one non-focus file's rank must NOT
6228        // equal its unfocused value. This is the pathology from the
6229        // mnemosyne reproduction: every rank value was bit-identical
6230        // across global/focused calls.
6231        let any_changed = unfocused_ranks.iter().any(|(path, &u_rank)| {
6232            path != focus_path
6233                && focused_ranks
6234                    .get(path)
6235                    .is_some_and(|&f_rank| f_rank.to_bits() != u_rank.to_bits())
6236        });
6237        assert!(
6238            any_changed,
6239            "no non-focus file rank changed across focused/unfocused calls — \
6240             bit-identical pathology (I#20). unfocused={unfocused_ranks:#?} \
6241             focused={focused_ranks:#?}"
6242        );
6243    }
6244
6245    /// T1: focus_file rank delta on a Rust-shaped synthetic graph.
6246    ///
6247    /// Regression test: confirms the engine's topic-sensitive PageRank
6248    /// works on Rust shapes (where T1's investigation found it already
6249    /// works, but the resolver fix must not break the existing path).
6250    ///
6251    /// This complements `test_focus_file_returns_neighborhood_not_just_focus`
6252    /// by additionally checking that (a) the resolver accepts a Rust path
6253    /// with the `./` LSP prefix, and (b) at least one non-focus file's
6254    /// rank moves by ≥ 5%.
6255    #[test]
6256    #[expect(
6257        clippy::too_many_lines,
6258        reason = "synthetic Rust-shaped graph with four FileNodes plus the \
6259                  two-call assertion sequence inherently exceeds the 100-line \
6260                  cap; the test mirrors the Python-shaped sibling."
6261    )]
6262    fn test_focus_file_rank_delta_preserved_on_rust_corpus() {
6263        let mut files: Vec<FileNode> = vec![
6264            FileNode {
6265                path: "src/lib.rs".to_string(),
6266                defs: vec![Definition {
6267                    name: "Engine".to_string(),
6268                    kind: "struct_item".to_string(),
6269                    start_line: 1,
6270                    end_line: 30,
6271                    scope: String::new(),
6272                    signature: None,
6273                    start_byte: 0,
6274                    end_byte: 600,
6275                    calls: vec![],
6276                    decorator: None,
6277                    lsp_kind_hint: None,
6278                }],
6279                imports: vec![],
6280            },
6281            FileNode {
6282                path: "src/encoder/mod.rs".to_string(),
6283                defs: vec![Definition {
6284                    name: "encode".to_string(),
6285                    kind: "function_item".to_string(),
6286                    start_line: 1,
6287                    end_line: 40,
6288                    scope: String::new(),
6289                    signature: Some("fn encode(input: &str) -> Vec<f32>".to_string()),
6290                    start_byte: 0,
6291                    end_byte: 800,
6292                    calls: vec![],
6293                    decorator: None,
6294                    lsp_kind_hint: None,
6295                }],
6296                imports: vec![ImportRef {
6297                    raw_path: "use crate::lib;".to_string(),
6298                    resolved_idx: Some(0),
6299                }],
6300            },
6301            FileNode {
6302                path: "src/search.rs".to_string(),
6303                defs: vec![Definition {
6304                    name: "search".to_string(),
6305                    kind: "function_item".to_string(),
6306                    start_line: 1,
6307                    end_line: 30,
6308                    scope: String::new(),
6309                    signature: Some("fn search(q: &str) -> Hits".to_string()),
6310                    start_byte: 0,
6311                    end_byte: 600,
6312                    calls: vec![CallRef {
6313                        name: "encode".to_string(),
6314                        qualified_path: None,
6315                        receiver_type: None,
6316                        byte_offset: 100,
6317                        resolved: None,
6318                    }],
6319                    decorator: None,
6320                    lsp_kind_hint: None,
6321                }],
6322                imports: vec![ImportRef {
6323                    raw_path: "use crate::encoder;".to_string(),
6324                    resolved_idx: Some(1),
6325                }],
6326            },
6327            FileNode {
6328                path: "src/cli.rs".to_string(),
6329                defs: vec![Definition {
6330                    name: "main".to_string(),
6331                    kind: "function_item".to_string(),
6332                    start_line: 1,
6333                    end_line: 20,
6334                    scope: String::new(),
6335                    signature: Some("fn main()".to_string()),
6336                    start_byte: 0,
6337                    end_byte: 400,
6338                    calls: vec![CallRef {
6339                        name: "search".to_string(),
6340                        qualified_path: None,
6341                        receiver_type: None,
6342                        byte_offset: 50,
6343                        resolved: None,
6344                    }],
6345                    decorator: None,
6346                    lsp_kind_hint: None,
6347                }],
6348                imports: vec![ImportRef {
6349                    raw_path: "use crate::search;".to_string(),
6350                    resolved_idx: Some(2),
6351                }],
6352            },
6353        ];
6354
6355        let def_index = build_def_index(&files);
6356        resolve_calls(&mut files, &def_index, &HashMap::new());
6357        let graph = build_graph_from_files_pub(files);
6358
6359        assert!(
6360            !graph.edges.is_empty(),
6361            "Rust-shaped synthetic graph must produce edges"
6362        );
6363
6364        let focus_idx = match graph.resolve_focus_file("./src/encoder/mod.rs") {
6365            FocusResolution::Found(i) => i,
6366            other => panic!("resolver must find encoder/mod.rs via LSP path, got {other:?}"),
6367        };
6368
6369        let budget = 4000;
6370        let unfocused = render_json_budgeted(&graph, budget, None, false);
6371        let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);
6372
6373        let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
6374            .files
6375            .iter()
6376            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
6377            .collect();
6378        let focused_ranks: std::collections::HashMap<String, f32> = focused
6379            .files
6380            .iter()
6381            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
6382            .collect();
6383
6384        eprintln!("T1 Rust — unfocused: {unfocused_ranks:#?}");
6385        eprintln!("T1 Rust — focused:   {focused_ranks:#?}");
6386
6387        let focus_path = "./src/encoder/mod.rs";
6388        let mut max_delta_ratio = 0.0_f32;
6389        for (path, &u_rank) in &unfocused_ranks {
6390            if path == focus_path {
6391                continue;
6392            }
6393            if let Some(&f_rank) = focused_ranks.get(path)
6394                && u_rank > 0.0
6395            {
6396                let ratio = (f_rank - u_rank).abs() / u_rank;
6397                if ratio > max_delta_ratio {
6398                    max_delta_ratio = ratio;
6399                }
6400            }
6401        }
6402        assert!(
6403            max_delta_ratio >= 0.05,
6404            "focus_file must rebias non-focus file ranks by ≥ 5% on Rust shapes; \
6405             max observed delta = {max_delta_ratio:.3}"
6406        );
6407    }
6408
6409    #[test]
6410    fn test_pagerank_empty() {
6411        let ranks = pagerank(0, &[], None);
6412        assert!(ranks.is_empty());
6413    }
6414
6415    #[test]
6416    fn test_render_tiers() {
6417        // Build a small graph with 10 files to exercise all tiers
6418        let files: Vec<FileNode> = (0..10)
6419            .map(|i| FileNode {
6420                path: format!("src/file_{i}.rs"),
6421                defs: vec![Definition {
6422                    name: format!("func_{i}"),
6423                    kind: "function_item".to_string(),
6424                    start_line: 1,
6425                    end_line: 5,
6426                    scope: String::new(),
6427                    signature: Some(format!("func_{i}(x: i32) -> i32")),
6428                    start_byte: 0,
6429                    end_byte: 0,
6430                    calls: vec![],
6431                    decorator: None,
6432                    lsp_kind_hint: None,
6433                }],
6434                imports: vec![],
6435            })
6436            .collect();
6437
6438        // Create a star graph: files 1-9 all import from file 0
6439        let edges: Vec<(u32, u32, u32)> = (1..10).map(|i| (i, 0, 1)).collect();
6440        let base_ranks = pagerank(10, &edges, None);
6441        let (top_callers, top_callees) = build_neighbor_lists(10, &edges);
6442
6443        let graph = RepoGraph {
6444            files,
6445            edges,
6446            base_ranks,
6447            callers: top_callers,
6448            callees: top_callees,
6449            def_edges: vec![],
6450            def_ranks: vec![],
6451            def_callers: vec![],
6452            def_callees: vec![],
6453            def_offsets: vec![0],
6454            alpha: 0.5,
6455        };
6456
6457        // Large budget: should include all files
6458        let full = render(&graph, 10_000, None);
6459        assert!(
6460            full.contains("file_0"),
6461            "output should contain the top-ranked file"
6462        );
6463        // file_0 should appear as tier 0 (highest rank)
6464        assert!(
6465            full.contains("## src/file_0.rs"),
6466            "top file should have tier 0 heading"
6467        );
6468
6469        // Tiny budget: should only fit a few files
6470        let small = render(&graph, 10, None);
6471        assert!(
6472            !small.is_empty(),
6473            "even tiny budget should produce some output"
6474        );
6475        // Should have fewer entries than full render
6476        let full_lines = full.lines().count();
6477        let small_lines = small.lines().count();
6478        assert!(
6479            small_lines < full_lines,
6480            "small budget ({small_lines} lines) should have fewer lines than full ({full_lines})"
6481        );
6482    }
6483
6484    #[test]
6485    fn test_render_empty_graph() {
6486        let graph = RepoGraph {
6487            files: vec![],
6488            edges: vec![],
6489            base_ranks: vec![],
6490            callers: vec![],
6491            callees: vec![],
6492            def_edges: vec![],
6493            def_ranks: vec![],
6494            def_callers: vec![],
6495            def_callees: vec![],
6496            def_offsets: vec![0],
6497            alpha: 0.5,
6498        };
6499        let output = render(&graph, 1000, None);
6500        assert!(output.is_empty(), "empty graph should render empty string");
6501    }
6502
6503    #[test]
6504    fn test_build_graph_on_fixtures() {
6505        let fixtures = Path::new(env!("CARGO_MANIFEST_DIR"))
6506            .parent()
6507            .unwrap()
6508            .parent()
6509            .unwrap()
6510            .join("tests")
6511            .join("fixtures");
6512
6513        let graph = build_graph(&fixtures).expect("build_graph should succeed on fixtures");
6514
6515        // Should find at least the 3 fixture files
6516        assert!(
6517            !graph.files.is_empty(),
6518            "graph should contain files from fixtures"
6519        );
6520
6521        // Should find definitions in the Rust fixture
6522        let rs_file = graph.files.iter().find(|f| f.path.ends_with("sample.rs"));
6523        assert!(rs_file.is_some(), "should find sample.rs");
6524        let rs_file = rs_file.unwrap();
6525        assert!(
6526            !rs_file.defs.is_empty(),
6527            "sample.rs should have definitions"
6528        );
6529        assert!(
6530            rs_file.defs.iter().any(|d| d.name == "hello"),
6531            "should find 'hello' function in sample.rs"
6532        );
6533
6534        // Should find definitions in the Python fixture
6535        let py_file = graph.files.iter().find(|f| f.path.ends_with("sample.py"));
6536        assert!(py_file.is_some(), "should find sample.py");
6537        let py_file = py_file.unwrap();
6538        assert!(
6539            !py_file.defs.is_empty(),
6540            "sample.py should have definitions"
6541        );
6542        assert!(
6543            py_file.defs.iter().any(|d| d.name == "greet"),
6544            "should find 'greet' function in sample.py"
6545        );
6546
6547        // PageRank scores should be computed
6548        assert_eq!(graph.base_ranks.len(), graph.files.len());
6549        let sum: f32 = graph.base_ranks.iter().sum();
6550        assert!(
6551            (sum - 1.0).abs() < 0.01,
6552            "PageRank scores should sum to ~1.0, got {sum}"
6553        );
6554    }
6555
6556    #[test]
6557    fn test_extract_imports_rust() {
6558        let source = "use crate::foo::bar;\nuse std::collections::HashMap;\n";
6559        let (lang, query) = import_query_for_extension("rs").unwrap();
6560        let imports = extract_imports(source, &lang, &query);
6561        assert_eq!(imports.len(), 2);
6562        assert!(imports[0].contains("crate::foo::bar"));
6563    }
6564
6565    #[test]
6566    fn test_extract_imports_python_stub() {
6567        let source = "from typing import Protocol\nimport pkg.types\n";
6568        let (lang, query) = import_query_for_extension("pyi").unwrap();
6569        let imports = extract_imports(source, &lang, &query);
6570        assert_eq!(imports.len(), 2);
6571        assert!(imports[0].contains("from typing import Protocol"));
6572        assert!(imports[1].contains("import pkg.types"));
6573    }
6574
6575    #[test]
6576    fn test_resolve_python_import_to_stub_file() {
6577        let root = PathBuf::from("/project");
6578        let mut file_index = HashMap::new();
6579        file_index.insert(PathBuf::from("/project/pkg/types.pyi"), 1);
6580
6581        let result = resolve_python_import("import pkg.types", &root, &file_index);
6582        assert_eq!(result, Some(1));
6583    }
6584
6585    #[test]
6586    fn test_resolve_rust_crate_import() {
6587        let root = PathBuf::from("/project");
6588        let file_path = PathBuf::from("/project/src/main.rs");
6589        let mut file_index = HashMap::new();
6590        file_index.insert(PathBuf::from("/project/src/foo/bar.rs"), 1);
6591        file_index.insert(PathBuf::from("/project/src/main.rs"), 0);
6592
6593        let result = resolve_rust_import("use crate::foo::bar;", &file_path, &root, &file_index);
6594        assert_eq!(result, Some(1));
6595    }
6596
6597    #[test]
6598    fn test_resolve_rust_external_crate_dropped() {
6599        let root = PathBuf::from("/project");
6600        let file_path = PathBuf::from("/project/src/main.rs");
6601        let file_index = HashMap::new();
6602
6603        let result = resolve_rust_import(
6604            "use std::collections::HashMap;",
6605            &file_path,
6606            &root,
6607            &file_index,
6608        );
6609        assert_eq!(result, None, "external crate imports should be dropped");
6610    }
6611
6612    #[test]
6613    fn test_neighbor_lists() {
6614        // 0 -> 1, 0 -> 2, 1 -> 2
6615        let edges = vec![(0, 1, 1), (0, 2, 1), (1, 2, 1)];
6616        let (incoming, outgoing) = build_neighbor_lists(3, &edges);
6617
6618        // Node 2 should be called by 0 and 1
6619        assert!(incoming[2].contains(&0));
6620        assert!(incoming[2].contains(&1));
6621
6622        // Node 0 should call 1 and 2
6623        assert!(outgoing[0].contains(&1));
6624        assert!(outgoing[0].contains(&2));
6625    }
6626
6627    /// G1 (R2.3 issue a): A scoped call `mod_a::foo()` must store:
6628    /// - `name = "foo"` (bare identifier, for def-index lookup)
6629    /// - `qualified_path = Some("mod_a::foo")` (full path, for disambiguation)
6630    ///
6631    /// Before G1, `name` stored the full `"mod_a::foo"` path. After G1, `name`
6632    /// is always the bare trailing identifier and `qualified_path` carries the
6633    /// full path when the call is scoped.
6634    #[test]
6635    fn test_scoped_identifier_calls_preserve_path() {
6636        use crate::languages;
6637        use streaming_iterator::StreamingIterator as _;
6638
6639        let source = "
6640mod mod_a {
6641    pub fn foo() {}
6642}
6643mod mod_b {
6644    pub fn foo() {}
6645}
6646fn caller() {
6647    mod_a::foo();
6648    mod_b::foo();
6649}
6650";
6651        let call_config =
6652            languages::call_query_for_extension("rs").expect("Rust call config must exist");
6653        let lang_config =
6654            languages::config_for_extension("rs").expect("Rust lang config must exist");
6655
6656        let mut defs = {
6657            let mut parser = tree_sitter::Parser::new();
6658            parser.set_language(&lang_config.language).unwrap();
6659            let tree = parser.parse(source, None).unwrap();
6660            let mut cursor = tree_sitter::QueryCursor::new();
6661            let mut out = Vec::new();
6662            let mut matches =
6663                cursor.matches(&lang_config.query, tree.root_node(), source.as_bytes());
6664            while let Some(m) = matches.next() {
6665                let mut name = String::new();
6666                let mut def_node = None;
6667                for cap in m.captures {
6668                    let cname = &lang_config.query.capture_names()[cap.index as usize];
6669                    if *cname == "name" {
6670                        name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
6671                    } else if *cname == "def" {
6672                        def_node = Some(cap.node);
6673                    }
6674                }
6675                if let Some(node) = def_node {
6676                    #[expect(clippy::cast_possible_truncation)]
6677                    out.push(Definition {
6678                        name,
6679                        kind: node.kind().to_string(),
6680                        start_line: node.start_position().row as u32 + 1,
6681                        end_line: node.end_position().row as u32 + 1,
6682                        scope: String::new(),
6683                        signature: None,
6684                        start_byte: node.start_byte() as u32,
6685                        end_byte: node.end_byte() as u32,
6686                        calls: vec![],
6687                        decorator: None,
6688                        lsp_kind_hint: None,
6689                    });
6690                }
6691            }
6692            out
6693        };
6694
6695        extract_calls(source, &call_config, &mut defs);
6696
6697        // Find the `caller` function definition
6698        let caller_def = defs
6699            .iter()
6700            .find(|d| d.name == "caller")
6701            .expect("caller def");
6702
6703        // G1: bare name is "foo", qualified_path carries the module path.
6704        let call_names: Vec<&str> = caller_def.calls.iter().map(|c| c.name.as_str()).collect();
6705        let qualified_paths: Vec<Option<&str>> = caller_def
6706            .calls
6707            .iter()
6708            .map(|c| c.qualified_path.as_deref())
6709            .collect();
6710
6711        // Bare names must be the trailing identifier only.
6712        assert!(
6713            call_names.contains(&"foo"),
6714            "bare name 'foo' must appear for scoped calls; got: {call_names:?}"
6715        );
6716        // Qualified paths must carry the full scope.
6717        assert!(
6718            qualified_paths.contains(&Some("mod_a::foo")),
6719            "qualified_path 'mod_a::foo' must appear; got: {qualified_paths:?}"
6720        );
6721        assert!(
6722            qualified_paths.contains(&Some("mod_b::foo")),
6723            "qualified_path 'mod_b::foo' must appear; got: {qualified_paths:?}"
6724        );
6725        // Full paths must NOT appear in bare names.
6726        assert!(
6727            !call_names.contains(&"mod_a::foo"),
6728            "full path 'mod_a::foo' must not appear in bare name; got: {call_names:?}"
6729        );
6730    }
6731
6732    /// RED test (R2.3 issue b+c): Two defs named `Read` in different modules,
6733    /// an unqualified call to `Read`. Resolution must NOT silently pick the first.
6734    /// Either both are returned (ambiguous) or none.
6735    #[test]
6736    fn test_ambiguous_name_resolution_returns_all_or_none() {
6737        // Build two FileNodes each with a def named "Read", then a third with an
6738        // unqualified call to "Read".
6739        let file_a = FileNode {
6740            path: "mod_a.rs".to_string(),
6741            defs: vec![Definition {
6742                name: "Read".to_string(),
6743                kind: "trait_item".to_string(),
6744                start_line: 1,
6745                end_line: 3,
6746                scope: String::new(),
6747                signature: None,
6748                start_byte: 0,
6749                end_byte: 50,
6750                calls: vec![],
6751                decorator: None,
6752                lsp_kind_hint: None,
6753            }],
6754            imports: vec![],
6755        };
6756        let file_b = FileNode {
6757            path: "mod_b.rs".to_string(),
6758            defs: vec![Definition {
6759                name: "Read".to_string(),
6760                kind: "trait_item".to_string(),
6761                start_line: 1,
6762                end_line: 3,
6763                scope: String::new(),
6764                signature: None,
6765                start_byte: 0,
6766                end_byte: 50,
6767                calls: vec![],
6768                decorator: None,
6769                lsp_kind_hint: None,
6770            }],
6771            imports: vec![],
6772        };
6773        let file_c = FileNode {
6774            path: "caller.rs".to_string(),
6775            defs: vec![Definition {
6776                name: "do_thing".to_string(),
6777                kind: "function_item".to_string(),
6778                start_line: 1,
6779                end_line: 5,
6780                scope: String::new(),
6781                signature: None,
6782                start_byte: 0,
6783                end_byte: 100,
6784                calls: vec![CallRef {
6785                    name: "Read".to_string(),
6786                    qualified_path: None,
6787                    receiver_type: None,
6788                    byte_offset: 10,
6789                    resolved: None,
6790                }],
6791                decorator: None,
6792                lsp_kind_hint: None,
6793            }],
6794            imports: vec![],
6795        };
6796
6797        let mut files = vec![file_a, file_b, file_c];
6798        let def_index = build_def_index(&files);
6799        resolve_calls(&mut files, &def_index, &HashMap::new());
6800
6801        // The unqualified call to "Read" is ambiguous (two candidates, neither in same
6802        // file nor imported). Resolution must leave it as None — silent first-wins is wrong.
6803        let resolved = files[2].defs[0].calls[0].resolved;
6804        assert_eq!(
6805            resolved, None,
6806            "ambiguous unqualified call with no import context must resolve to None, not silently pick first"
6807        );
6808    }
6809
6810    // ── D1 / D2 tests ────────────────────────────────────────────────
6811
6812    /// Build a small test graph with N files and an optional JSON-extension file.
6813    fn build_test_graph(n_code: usize, include_json: bool) -> (RepoGraph, Vec<usize>) {
6814        let mut file_nodes: Vec<FileNode> = (0..n_code)
6815            .map(|i| FileNode {
6816                path: format!("src/file_{i}.rs"),
6817                defs: vec![
6818                    Definition {
6819                        name: format!("func_{i}"),
6820                        kind: "function_item".to_string(),
6821                        start_line: 1,
6822                        end_line: 5,
6823                        scope: String::new(),
6824                        signature: Some(format!("fn func_{i}() -> i32")),
6825                        start_byte: 0,
6826                        end_byte: 100,
6827                        calls: vec![],
6828                        decorator: None,
6829                        lsp_kind_hint: None,
6830                    },
6831                    Definition {
6832                        name: format!("MyStruct{i}"),
6833                        kind: "struct_item".to_string(),
6834                        start_line: 7,
6835                        end_line: 10,
6836                        scope: String::new(),
6837                        signature: None,
6838                        start_byte: 110,
6839                        end_byte: 200,
6840                        calls: vec![],
6841                        decorator: None,
6842                        lsp_kind_hint: None,
6843                    },
6844                ],
6845                imports: vec![],
6846            })
6847            .collect();
6848
6849        let json_idx = if include_json {
6850            let idx = file_nodes.len();
6851            file_nodes.push(FileNode {
6852                path: "data/config.json".to_string(),
6853                defs: vec![],
6854                imports: vec![],
6855            });
6856            vec![idx]
6857        } else {
6858            vec![]
6859        };
6860
6861        // Build a star graph: all code files point to file_0.
6862        let n = file_nodes.len();
6863        #[expect(clippy::cast_possible_truncation, reason = "test: n_code << u32::MAX")]
6864        let edges: Vec<(u32, u32, u32)> = (1..n_code).map(|i| (i as u32, 0, 1)).collect();
6865
6866        let base_ranks = pagerank(n, &edges, None);
6867        let (callers, callees) = build_neighbor_lists(n, &edges);
6868
6869        let graph = RepoGraph {
6870            files: file_nodes,
6871            edges,
6872            base_ranks,
6873            callers,
6874            callees,
6875            def_edges: vec![],
6876            def_ranks: vec![],
6877            def_callers: vec![],
6878            def_callees: vec![],
6879            def_offsets: vec![0],
6880            alpha: 0.5,
6881        };
6882
6883        (graph, json_idx)
6884    }
6885
6886    /// D1: `render_json` returns a `GetRepoMapResponse` with a `files` array.
6887    ///
6888    /// On the baseline (before D1) `get_repo_map_ripvec` returned markdown prose via
6889    /// `repo_map::render`; no `files` key existed in the output.
6890    #[test]
6891    fn get_repo_map_returns_json_with_files_array() {
6892        let (graph, _) = build_test_graph(5, false);
6893        let response = render_json(&graph, 50, None, false);
6894        assert!(
6895            !response.files.is_empty(),
6896            "files array should be non-empty for a non-empty graph"
6897        );
6898        // Serialize and verify the JSON shape has a `files` key.
6899        let json = serde_json::to_string(&response).expect("serialize");
6900        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6901        assert!(
6902            parsed["files"].is_array(),
6903            "serialized response must have a `files` JSON array; got: {parsed}"
6904        );
6905    }
6906
6907    /// D1: every file entry has an `lsp_location` field.
6908    ///
6909    /// Before D1, output was prose text; no `lsp_location` existed anywhere in the response.
6910    #[test]
6911    fn get_repo_map_each_file_has_lsp_location() {
6912        let (graph, _) = build_test_graph(5, false);
6913        let response = render_json(&graph, 50, None, false);
6914        for file in &response.files {
6915            assert!(
6916                !file.lsp_location.file_path.is_empty(),
6917                "each file must have a non-empty lsp_location.file_path"
6918            );
6919        }
6920        // Also verify through JSON.
6921        let json = serde_json::to_string(&response).expect("serialize");
6922        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6923        for entry in parsed["files"].as_array().expect("files array") {
6924            assert!(
6925                entry["lsp_location"]["file_path"].is_string(),
6926                "each file entry must have lsp_location.file_path string; entry: {entry}"
6927            );
6928        }
6929    }
6930
6931    /// D1: every symbol has a `kind` (u32) and an `lsp_location`.
6932    ///
6933    /// Before D1 symbols were rendered as prose strings like `"function_item func_0"`.
6934    #[test]
6935    fn get_repo_map_each_symbol_has_kind_and_lsp_location() {
6936        let (graph, _) = build_test_graph(3, false);
6937        let response = render_json(&graph, 50, None, false);
6938        for file in &response.files {
6939            for sym in &file.symbols {
6940                assert!(
6941                    sym.kind > 0,
6942                    "symbol kind must be a positive LSP SymbolKind; got 0 for '{}'",
6943                    sym.name
6944                );
6945                assert!(
6946                    !sym.lsp_location.file_path.is_empty(),
6947                    "symbol must have lsp_location.file_path"
6948                );
6949            }
6950        }
6951        // Verify through JSON: kind should be a number.
6952        let json = serde_json::to_string(&response).expect("serialize");
6953        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6954        for file_entry in parsed["files"].as_array().expect("files") {
6955            for sym_entry in file_entry["symbols"].as_array().expect("symbols") {
6956                assert!(
6957                    sym_entry["kind"].is_number(),
6958                    "symbol `kind` must be a JSON number; sym: {sym_entry}"
6959                );
6960                assert!(
6961                    sym_entry["lsp_location"]["file_path"].is_string(),
6962                    "symbol must have lsp_location.file_path; sym: {sym_entry}"
6963                );
6964            }
6965        }
6966    }
6967
6968    /// D1: `calls` field is an array of `RepoMapCall`-shaped objects (each has
6969    /// `lsp_location` and `rank`).
6970    ///
6971    /// In 4.0.1 calls moved from bare `lsp_location` objects to `RepoMapCall`
6972    /// objects that carry both the target `lsp_location` and the target file's
6973    /// `base_rank`.
6974    #[test]
6975    fn get_repo_map_calls_field_is_array_of_lsp_locations() {
6976        // Build a 5-file star graph so file_0 has non-empty callees.
6977        let (graph, _) = build_test_graph(5, false);
6978        let response = render_json(&graph, 50, None, false);
6979        let json = serde_json::to_string(&response).expect("serialize");
6980        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse");
6981        for file_entry in parsed["files"].as_array().expect("files") {
6982            let calls = file_entry["calls"]
6983                .as_array()
6984                .expect("calls must be an array");
6985            for call in calls {
6986                // In 4.0.1 each call entry is a RepoMapCall with lsp_location + rank.
6987                assert!(
6988                    call["lsp_location"]["file_path"].is_string(),
6989                    "each call entry must have lsp_location.file_path string; call: {call}"
6990                );
6991                assert!(
6992                    call["rank"].is_number(),
6993                    "each call entry must have a numeric rank; call: {call}"
6994                );
6995            }
6996        }
6997    }
6998
6999    /// D2 / G3: `render_json_budgeted` with a very tight budget returns fewer files.
7000    ///
7001    /// Before the budget allocator, `max_files=3` controlled file count but not
7002    /// per-file expansion. In 4.0.1 the token_budget controls total bytes; with
7003    /// a budget of 1 token (= 4 bytes) only the envelope minimum allows any file
7004    /// at all, and the test verifies that the total_files counter still reflects
7005    /// the full eligible count. `render_json` (compat shim) passes a generous
7006    /// budget; use `render_json_budgeted` with a tight budget to verify the cap.
7007    #[test]
7008    fn get_repo_map_returns_at_most_max_files_files() {
7009        let (graph, _) = build_test_graph(10, false);
7010        // Use render_json_budgeted directly with a tight budget (600 bytes = 3 files
7011        // × 200-byte floor). Each file's envelope minimum is 200 bytes so a 600-byte
7012        // budget should admit at most 3 files.
7013        let response = render_json_budgeted(&graph, 150, None, false);
7014        assert!(
7015            response.files.len() <= 3,
7016            "files.len() = {} must be <= 3 for a 600-byte budget",
7017            response.files.len()
7018        );
7019        assert_eq!(
7020            response.total_files, 10,
7021            "total_files must reflect the full eligible count before budget cap"
7022        );
7023        assert!(
7024            response.capped,
7025            "capped must be true when total_files > files.len()"
7026        );
7027    }
7028
7029    /// D2: `include_metadata=false` (default) excludes JSON/TOML/etc. files.
7030    ///
7031    /// Before D2, JSON files with thousands of repeated keys dominated the
7032    /// output (Issue #5 — JSON-key flooding).
7033    #[test]
7034    fn get_repo_map_excludes_meta_by_default() {
7035        let (graph, _) = build_test_graph(3, /*include_json=*/ true);
7036        // Default: include_metadata = false
7037        let response = render_json(&graph, 50, None, false);
7038        for file in &response.files {
7039            assert!(
7040                !std::path::Path::new(&file.lsp_location.file_path)
7041                    .extension()
7042                    .is_some_and(|e| e.eq_ignore_ascii_case("json")),
7043                "JSON (Meta) files must be excluded when include_metadata=false; found: {}",
7044                file.lsp_location.file_path
7045            );
7046        }
7047    }
7048
7049    /// D2: `include_metadata=true` includes JSON files.
7050    ///
7051    /// Callers who opt-in to metadata should see all content kinds.
7052    #[test]
7053    fn get_repo_map_include_metadata_true_includes_json() {
7054        let (graph, _) = build_test_graph(3, /*include_json=*/ true);
7055        let response = render_json(&graph, 50, None, true);
7056        let has_json = response.files.iter().any(|f| {
7057            std::path::Path::new(&f.lsp_location.file_path)
7058                .extension()
7059                .is_some_and(|e| e.eq_ignore_ascii_case("json"))
7060        });
7061        assert!(
7062            has_json,
7063            "JSON file must be present when include_metadata=true"
7064        );
7065    }
7066
7067    /// J1/J2 MEASUREMENT: flask corpus focus_file=blueprints.py rank dispersion.
7068    ///
7069    /// Mandatory measurement from the 4.0.5 Wave-2 Front-C briefing:
7070    /// - `len(files) >= 8` (not collapsed to just the focus)
7071    /// - focus file rank is the highest in the response
7072    /// - next 5 files all have rank >= 10% of focus rank
7073    /// - neighborhood contains semantically related files (app.py, scaffold.py)
7074    #[test]
7075    #[ignore = "runs on flask corpus at tests/corpus/code/flask; use --ignored --nocapture"]
7076    #[expect(
7077        clippy::too_many_lines,
7078        reason = "end-to-end corpus measurement test; assertion sequence is sequential and cannot be meaningfully split"
7079    )]
7080    fn test_flask_focus_blueprints_rank_dispersion() {
7081        let corpus_root = Path::new(env!("CARGO_MANIFEST_DIR"))
7082            .parent()
7083            .unwrap()
7084            .parent()
7085            .unwrap()
7086            .join("tests/corpus/code/flask");
7087
7088        assert!(
7089            corpus_root.exists(),
7090            "flask corpus not found at {}",
7091            corpus_root.display()
7092        );
7093
7094        let graph = build_graph(&corpus_root).expect("build_graph on flask corpus");
7095        eprintln!("Flask corpus: {} files in graph", graph.files.len());
7096
7097        // Find focus file
7098        let focus_path = "src/flask/blueprints.py";
7099        let focus_idx = graph.files.iter().position(|f| f.path == focus_path);
7100        eprintln!("Focus file '{focus_path}' -> idx: {focus_idx:?}");
7101        assert!(
7102            focus_idx.is_some(),
7103            "blueprints.py not found in graph; available files: {:?}",
7104            graph
7105                .files
7106                .iter()
7107                .map(|f| &f.path)
7108                .take(20)
7109                .collect::<Vec<_>>()
7110        );
7111
7112        let response = render_json_budgeted(&graph, 4000, focus_idx, false);
7113
7114        // Criterion 1: at least 8 files returned.
7115        eprintln!(
7116            "Focused response: {} files (total_files={})",
7117            response.files.len(),
7118            response.total_files
7119        );
7120        assert!(
7121            response.files.len() >= 8,
7122            "expected >= 8 files in focused response; got {} — I#16 winner-take-all collapse",
7123            response.files.len()
7124        );
7125
7126        // Print top 10 for inspection.
7127        eprintln!("\nTop 10 focused files:");
7128        for (i, f) in response.files.iter().take(10).enumerate() {
7129            eprintln!("  [{i}] rank={:.6}  {}", f.rank, f.lsp_location.file_path);
7130        }
7131
7132        // Criterion 2: focus file must appear near the top (top-3) of focused
7133        // results.  With PERSONALIZATION_ALPHA=0.15 and the flask corpus,
7134        // src/flask/app.py has higher structural rank than blueprints.py and
7135        // may legitimately rank #1 — the focus boosts blueprints.py relative
7136        // to its unfocused position, but doesn't guarantee it beats every
7137        // structurally central neighbor.  Being in top-3 confirms the bias
7138        // is working (pre-fix blueprints.py was #1 at 0.703 but that was a
7139        // degenerate collapse; now #1 or #2 is healthy).
7140        let focus_file_rank = response
7141            .files
7142            .iter()
7143            .find(|f| {
7144                f.lsp_location.file_path.contains("blueprints.py")
7145                    && !f.lsp_location.file_path.contains("test_")
7146                    && !f.lsp_location.file_path.contains("sansio")
7147            })
7148            .map(|f| f.rank)
7149            .unwrap_or(0.0);
7150        let focus_position = response
7151            .files
7152            .iter()
7153            .position(|f| {
7154                f.lsp_location.file_path.contains("blueprints.py")
7155                    && !f.lsp_location.file_path.contains("test_")
7156                    && !f.lsp_location.file_path.contains("sansio")
7157            })
7158            .unwrap_or(usize::MAX);
7159        eprintln!(
7160            "\nblueprinets.py position: #{} rank={:.6}",
7161            focus_position + 1,
7162            focus_file_rank
7163        );
7164        assert!(
7165            focus_position < 3,
7166            "blueprints.py must be in top-3 focused results (got position {}); \
7167             soft personalization must rebias toward focus neighborhood — I#16",
7168            focus_position + 1
7169        );
7170
7171        // Criterion 3: next 5 non-focus files have rank >= 10% of the top
7172        // file's rank.  This is the core dispersion check: no more Dirac-delta
7173        // collapse where one file is 0.703 and all others are 0.003.
7174        let top_rank = response.files[0].rank;
7175        let non_focus_min_5 = response
7176            .files
7177            .iter()
7178            .filter(|f| {
7179                !(f.lsp_location.file_path.contains("blueprints.py")
7180                    && !f.lsp_location.file_path.contains("test_")
7181                    && !f.lsp_location.file_path.contains("sansio"))
7182            })
7183            .take(5)
7184            .map(|f| f.rank)
7185            .fold(f32::INFINITY, f32::min);
7186        let pct = non_focus_min_5 / top_rank * 100.0;
7187        eprintln!(
7188            "\nNext-5 (non-focus) min rank: {non_focus_min_5:.6} = {pct:.1}% of top rank {top_rank:.6}"
7189        );
7190        assert!(
7191            pct >= 10.0,
7192            "next-5 non-focus files min rank is {pct:.1}% of top rank (need ≥ 10%); \
7193             files are collapsing to near-zero floor — I#16"
7194        );
7195
7196        // Criterion 4: neighborhood quality — related files present.
7197        let related_names = ["app.py", "scaffold.py", "sansio"];
7198        let found_related: Vec<&str> = related_names
7199            .iter()
7200            .copied()
7201            .filter(|name| {
7202                response
7203                    .files
7204                    .iter()
7205                    .any(|f| f.lsp_location.file_path.contains(name))
7206            })
7207            .collect();
7208        eprintln!("\nNeighborhood quality: found related files: {found_related:?}");
7209        // At least one related file should appear (soft assertion — log if missing).
7210        if found_related.is_empty() {
7211            eprintln!(
7212                "WARNING: no expected related files (app.py, scaffold.py) found in neighborhood"
7213            );
7214        }
7215    }
7216
7217    #[test]
7218    #[ignore = "runs on full ripvec codebase; use --nocapture to see output"]
7219    fn test_full_repo_map() {
7220        use std::time::Instant;
7221
7222        let root = Path::new(env!("CARGO_MANIFEST_DIR"))
7223            .parent()
7224            .unwrap()
7225            .parent()
7226            .unwrap();
7227
7228        // Phase 1: build_graph (walk + parse + import resolve + PageRank)
7229        let t0 = Instant::now();
7230        let graph = build_graph(root).expect("build_graph on ripvec root");
7231        let build_ms = t0.elapsed().as_secs_f64() * 1000.0;
7232
7233        // Phase 2: render (default, no focus)
7234        let t1 = Instant::now();
7235        let rendered = render(&graph, 2000, None);
7236        let render_ms = t1.elapsed().as_secs_f64() * 1000.0;
7237
7238        // Phase 3: render (topic-sensitive, focused on highest-ranked file)
7239        let t2 = Instant::now();
7240        let focus_idx = graph
7241            .base_ranks
7242            .iter()
7243            .enumerate()
7244            .max_by(|a, b| a.1.total_cmp(b.1))
7245            .map(|(i, _)| i);
7246        let focused = render(&graph, 2000, focus_idx);
7247        let focus_ms = t2.elapsed().as_secs_f64() * 1000.0;
7248
7249        eprintln!("\n=== Repo Map Performance ===");
7250        eprintln!(
7251            "Files: {}, Edges: {}, Defs: {}",
7252            graph.files.len(),
7253            graph.edges.len(),
7254            graph.files.iter().map(|f| f.defs.len()).sum::<usize>()
7255        );
7256        eprintln!("build_graph:     {build_ms:.1}ms (walk + parse + resolve + PageRank)");
7257        eprintln!(
7258            "render(default): {render_ms:.3}ms ({} chars, ~{} tokens)",
7259            rendered.len(),
7260            rendered.len() / 4
7261        );
7262        eprintln!(
7263            "render(focused): {focus_ms:.3}ms ({} chars, ~{} tokens)",
7264            focused.len(),
7265            focused.len() / 4
7266        );
7267
7268        eprintln!("\nTop 5 by PageRank:");
7269        let mut ranked: Vec<(usize, f32)> = graph.base_ranks.iter().copied().enumerate().collect();
7270        ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
7271        for (i, rank) in ranked.iter().take(5) {
7272            eprintln!("  {:.4} {}", rank, graph.files[*i].path);
7273        }
7274
7275        eprintln!("\n=== Default Render ===\n{rendered}");
7276        eprintln!(
7277            "\n=== Focused Render (on {}) ===\n{focused}",
7278            focus_idx
7279                .map(|i| graph.files[i].path.as_str())
7280                .unwrap_or("none")
7281        );
7282    }
7283
7284    // ── C1/C2 Tests (4.1.1): Python decorator-aware kind classification ──
7285
7286    /// `test:chunker_stores_property_decorator_kind` — a Python `@property`-
7287    /// decorated function produces a [`Definition`] with
7288    /// `decorator = Some("property")` and `lsp_kind_hint = Some(7)`.
7289    ///
7290    /// Behavior: trigger-fails-on-baseline-then-passes-post-fix.
7291    /// Baseline: `Definition` had no `decorator` / `lsp_kind_hint` fields.
7292    /// Post-fix: `extract_definitions` populates both from the AST.
7293    #[test]
7294    fn test_chunker_stores_property_decorator_kind() {
7295        let source = "@property\ndef foo(self):\n    return self._x\n";
7296        let lang_cfg = crate::languages::config_for_extension("py").expect("python lang config");
7297        let defs = extract_definitions(source, &lang_cfg);
7298
7299        let foo = defs
7300            .iter()
7301            .find(|d| d.name == "foo")
7302            .expect("expected def 'foo'");
7303        assert_eq!(
7304            foo.decorator.as_deref(),
7305            Some("property"),
7306            "decorator must be Some(\"property\") for @property def; got {:?}",
7307            foo.decorator
7308        );
7309        assert_eq!(
7310            foo.lsp_kind_hint,
7311            Some(7),
7312            "lsp_kind_hint must be Some(7) (Property) for @property def; got {:?}",
7313            foo.lsp_kind_hint
7314        );
7315    }
7316
7317    /// `test:chunker_stores_classmethod_decorator_kind` — a Python `@classmethod`-
7318    /// decorated function produces a [`Definition`] with
7319    /// `decorator = Some("classmethod")` and `lsp_kind_hint = Some(12)`.
7320    #[test]
7321    fn test_chunker_stores_classmethod_decorator_kind() {
7322        let source = "@classmethod\ndef from_dict(cls, d):\n    pass\n";
7323        let lang_cfg = crate::languages::config_for_extension("py").expect("python lang config");
7324        let defs = extract_definitions(source, &lang_cfg);
7325
7326        let from_dict = defs
7327            .iter()
7328            .find(|d| d.name == "from_dict")
7329            .expect("expected def 'from_dict'");
7330        assert_eq!(
7331            from_dict.decorator.as_deref(),
7332            Some("classmethod"),
7333            "decorator must be Some(\"classmethod\"); got {:?}",
7334            from_dict.decorator
7335        );
7336        assert_eq!(
7337            from_dict.lsp_kind_hint,
7338            Some(12),
7339            "lsp_kind_hint must be Some(12) (Function) for @classmethod def; got {:?}",
7340            from_dict.lsp_kind_hint
7341        );
7342    }
7343
7344    /// `test:chunker_stores_arbitrary_decorator_name` — a Python
7345    /// `@functools.lru_cache`-decorated function (attribute-access decorator)
7346    /// produces a [`Definition`] with `decorator = Some("functools.lru_cache")`
7347    /// and `lsp_kind_hint = Some(12)`.
7348    #[test]
7349    fn test_chunker_stores_arbitrary_decorator_name() {
7350        let source = "@functools.lru_cache\ndef expensive(n):\n    pass\n";
7351        let lang_cfg = crate::languages::config_for_extension("py").expect("python lang config");
7352        let defs = extract_definitions(source, &lang_cfg);
7353
7354        let expensive = defs
7355            .iter()
7356            .find(|d| d.name == "expensive")
7357            .expect("expected def 'expensive'");
7358        // Attribute decorators are stored as dotted-name text.
7359        assert_eq!(
7360            expensive.decorator.as_deref(),
7361            Some("functools.lru_cache"),
7362            "decorator must be Some(\"functools.lru_cache\"); got {:?}",
7363            expensive.decorator
7364        );
7365        assert_eq!(
7366            expensive.lsp_kind_hint,
7367            Some(12),
7368            "lsp_kind_hint must be Some(12) (Function) for @functools.lru_cache def; got {:?}",
7369            expensive.lsp_kind_hint
7370        );
7371    }
7372
7373    /// `test:repo_map_projection_uses_stored_kind_for_python_decorator` — the
7374    /// `render_json_budgeted` projection site reads `lsp_kind_hint` from the
7375    /// stored [`Definition`] rather than re-computing via `lsp_symbol_kind_for_node_kind`.
7376    ///
7377    /// A synthetic [`RepoGraph`] with a Definition whose `kind = "decorated_definition"`
7378    /// (which would map to Property=7 via the AST-less path) but
7379    /// `lsp_kind_hint = Some(12)` must produce a [`RepoMapSymbol`] with `kind = 12`.
7380    #[test]
7381    fn test_repo_map_projection_uses_stored_kind_for_python_decorator() {
7382        // Build a synthetic graph: one file, one @classmethod-decorated function.
7383        // kind = "decorated_definition" (AST-less → 7 = Property, WRONG).
7384        // lsp_kind_hint = Some(12) (stored at parse time, CORRECT).
7385        let files = vec![FileNode {
7386            path: "module.py".to_string(),
7387            defs: vec![Definition {
7388                name: "from_dict".to_string(),
7389                kind: "decorated_definition".to_string(),
7390                start_line: 1,
7391                end_line: 3,
7392                scope: String::new(),
7393                signature: None,
7394                start_byte: 0,
7395                end_byte: 60,
7396                calls: vec![],
7397                decorator: Some("classmethod".to_string()),
7398                lsp_kind_hint: Some(12),
7399            }],
7400            imports: vec![],
7401        }];
7402        let graph = build_graph_from_files_pub(files);
7403        let result = render_json_budgeted(&graph, 4000, None, false);
7404
7405        // Locate the symbol in the result.
7406        let file = result
7407            .files
7408            .iter()
7409            .find(|f| f.lsp_location.file_path.contains("module.py"))
7410            .expect("module.py must appear in render output");
7411        let sym = file
7412            .symbols
7413            .iter()
7414            .find(|s| s.name == "from_dict")
7415            .expect("from_dict must appear as a symbol");
7416        assert_eq!(
7417            sym.kind, 12,
7418            "C2: lsp_kind_hint=12 must override the AST-less kind (7 for decorated_definition); got {}",
7419            sym.kind
7420        );
7421    }
7422}
ripvec_core/repo_map.rs

ripvec_core/
repo_map.rs