ripvec-core 4.0.7

//! `PageRank`-weighted structural overview of a codebase.
//!
//! Builds a dependency graph from tree-sitter definition and import extraction,
//! ranks files by importance using `PageRank` (standard or topic-sensitive), and
//! renders a budget-constrained overview with tiered detail levels.

use std::collections::HashMap;
use std::fmt::Write as _;
use std::path::{Path, PathBuf};

use rayon::prelude::*;
use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
use streaming_iterator::StreamingIterator;
use tree_sitter::{Parser, Query, QueryCursor};

use serde::Serialize;

use crate::chunk::ContentKind;
use crate::languages;
use crate::walk;

/// Serialize a `ContentKind` to a lowercase string tag for JSON output.
fn content_kind_tag(ck: ContentKind) -> &'static str {
    match ck {
        ContentKind::Code => "code",
        ContentKind::Docs => "docs",
        ContentKind::Meta => "meta",
    }
}

// ── Data Structures ──────────────────────────────────────────────────

/// Persisted dependency graph with `PageRank` scores.
///
/// Carries a file-level and a definition-level view of the same graph.
/// The definition-level vectors (`def_ranks`, `def_callers`, `def_callees`)
/// are flattened across files; `def_offsets` maps a [`DefId`] to its
/// position in them.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct RepoGraph {
    /// Files in the repository with definitions, imports, and calls.
    pub files: Vec<FileNode>,
    /// File-level edges (derived from def-level call edges).
    ///
    /// NOTE(review): presumably `(source_file, target_file, weight)`,
    /// mirroring the layout of `def_edges` — confirm against the builder.
    pub edges: Vec<(u32, u32, u32)>,
    /// File-level `PageRank` scores (aggregated from def-level).
    pub base_ranks: Vec<f32>,
    /// File-level callers (indices into `files`).
    pub callers: Vec<Vec<u32>>,
    /// File-level callees (indices into `files`).
    pub callees: Vec<Vec<u32>>,
    /// Definition-level call edges: `(caller_def, callee_def, weight)`.
    pub def_edges: Vec<(DefId, DefId, u32)>,
    /// Definition-level `PageRank` scores (flattened: `def_offsets[file_idx] + def_idx`).
    pub def_ranks: Vec<f32>,
    /// Definition-level callers (flattened, parallel to `def_ranks`).
    pub def_callers: Vec<Vec<DefId>>,
    /// Definition-level callees (flattened, parallel to `def_ranks`).
    pub def_callees: Vec<Vec<DefId>>,
    /// Prefix-sum offsets for flattening `DefId` to linear index.
    pub def_offsets: Vec<usize>,
    /// Auto-tuned alpha for search boost.
    pub alpha: f32,
}

/// A file in the repository with its definitions and imports.
///
/// One `FileNode` is stored per entry of [`RepoGraph::files`]; other parts
/// of the graph refer to a file by its index in that vector.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct FileNode {
    /// Relative path from the repository root.
    pub path: String,
    /// Definitions (functions, structs, classes, etc.) extracted from this file.
    pub defs: Vec<Definition>,
    /// Import references extracted from this file.
    ///
    /// Each entry keeps the raw import text and, when resolution succeeded,
    /// the index of the imported file.
    pub imports: Vec<ImportRef>,
}

/// A definition extracted from a source file.
///
/// Positions are stored both as 1-based line numbers and as byte offsets;
/// the byte range is what call attribution uses to find the enclosing
/// definition of a call site.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct Definition {
    /// Name of the definition (e.g., function name, class name).
    ///
    /// Empty when the definition query matched without a `name` capture.
    pub name: String,
    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
    pub kind: String,
    /// 1-based start line number.
    pub start_line: u32,
    /// 1-based end line number.
    pub end_line: u32,
    /// Scope chain (e.g., `"impl_item Foo > fn bar"`).
    pub scope: String,
    /// Function/method signature, if available.
    pub signature: Option<String>,
    /// Byte offset of this definition's start in the source file.
    pub start_byte: u32,
    /// Byte offset of this definition's end in the source file.
    pub end_byte: u32,
    /// Call sites within this definition's body.
    ///
    /// Left empty by definition extraction and filled in afterwards by
    /// call extraction.
    pub calls: Vec<CallRef>,
}

/// An import reference extracted from a source file.
#[derive(Debug, Clone, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct ImportRef {
    /// Raw import text as captured from source.
    ///
    /// When the import query provides an `@import_path` capture (JS/TS, Go,
    /// Ruby) this is the path string with surrounding quotes stripped, e.g.
    /// `./foo`. Otherwise (Rust, Python) it is the text of the whole import
    /// statement, e.g. `use crate::foo::bar;`.
    pub raw_path: String,
    /// Resolved file index in [`RepoGraph::files`], if resolution succeeded.
    pub resolved_idx: Option<u32>,
}

/// Unique identifier for a definition: (file index, definition index within file).
///
/// The definition index is a `u16`, so a single file can address at most
/// 65,536 definitions through a `DefId`.
pub type DefId = (u32, u16);

/// A call site extracted from a definition body.
///
/// Freshly extracted call sites have `resolved == None`; resolution to a
/// concrete [`DefId`] happens in a later pass.
#[derive(Debug, Clone, Default, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct CallRef {
    /// Callee function/method name (bare, without qualifier).
    ///
    /// For scoped calls like `mod_a::foo()`, this is `"foo"`.
    /// For bare calls like `foo()`, this is `"foo"`.
    pub name: String,
    /// Full qualified path for scoped calls, e.g. `Some("mod_a::foo")`.
    ///
    /// `None` for bare (unqualified) calls. When `Some`, `resolve_calls`
    /// uses this for qualifier-based module disambiguation before falling
    /// back to the bare `name`.
    pub qualified_path: Option<String>,
    /// Receiver type for method calls, inferred from local context.
    ///
    /// Set to `Some("Foo")` when:
    /// - The call is `self.method()` inside `impl Foo { … }`.
    /// - The call is `x.method()` where `x` has an explicit type annotation `x: Foo`.
    /// - The call is `x.method()` after `let x = Foo::new()`.
    ///
    /// `None` for free function calls, or when the receiver type cannot be
    /// inferred from local context alone. When `Some`, `resolve_calls` prefers
    /// defs whose enclosing impl scope matches the receiver type.
    pub receiver_type: Option<String>,
    /// Byte offset of the call in the source file (for scoping to definitions).
    ///
    /// For query-extracted calls this is the start byte of the `@callee`
    /// capture, not of the surrounding call expression.
    pub byte_offset: u32,
    /// Resolved target definition, if resolution succeeded.
    pub resolved: Option<DefId>,
}

// ── JSON output types ────────────────────────────────────────────────

/// LSP-shaped location pointing at a file or symbol within a file.
///
/// Lines and characters are 0-based, matching the Language Server Protocol
/// convention so callers can pass this directly to LSP tools without any
/// conversion. Note that [`Definition`] stores 1-based lines, so producers
/// of this type have to shift by one.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapLspLocation {
    /// Relative path from the repository root (prefixed with `./`).
    pub file_path: String,
    /// 0-based start line.
    pub start_line: usize,
    /// 0-based start character (0 for file-level locations).
    pub start_character: usize,
    /// 0-based end line (equals `start_line` for file-level locations).
    pub end_line: usize,
    /// 0-based end character (0 for file-level locations).
    pub end_character: usize,
}

/// A top-level symbol extracted from a file in the repository map.
///
/// Analogous to an LSP `DocumentSymbol` but limited to the fields available
/// from tree-sitter definition extraction. The `rank` field carries the
/// definition-level `PageRank` score from [`RepoGraph::def_ranks`], enabling
/// callers to prioritise symbols by structural importance.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapSymbol {
    /// Symbol name (function name, struct name, etc.).
    pub name: String,
    /// LSP `SymbolKind` as a decimal number — the same values that
    /// `lsp_workspace_symbols` and `lsp_document_symbols` use.
    pub kind: u32,
    /// Location pointing at the symbol's definition line (0-based).
    pub lsp_location: RepoMapLspLocation,
    /// Definition-level `PageRank` score from [`RepoGraph::def_ranks`].
    ///
    /// Higher values indicate definitions that are called by many other
    /// definitions. Used by the token-budget allocator to decide which
    /// symbols to include when the per-file budget is constrained.
    pub rank: f32,
}

/// An outgoing call-edge from a file to another file.
///
/// Carries both the target file's `lsp_location` and its `base_rank`
/// (file-level `PageRank` score) so callers can decide how important
/// each dependency is without a separate lookup.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapCall {
    /// Location pointing at the target file (line 0, character 0).
    pub lsp_location: RepoMapLspLocation,
    /// File-level `PageRank` score of the target file (its `base_rank`).
    pub rank: f32,
}

/// One file entry in the JSON repo map.
///
/// Carries the file's `PageRank` score, content kind, outgoing call-edges to
/// other files, and the file's top-level symbol definitions — all with
/// `lsp_location` so the caller can chain directly into LSP tools without
/// any destructuring.
#[derive(Debug, Clone, Serialize)]
pub struct RepoMapFile {
    /// Location pointing at the file itself (line 0, character 0).
    ///
    /// Pass `lsp_location.file_path` directly into `lsp_document_symbols` or
    /// any other file-scoped tool.
    pub lsp_location: RepoMapLspLocation,
    /// `PageRank` score in [0, 1] (higher = more structurally central).
    pub rank: f32,
    /// Content classification: `"code"`, `"docs"`, or `"meta"`.
    ///
    /// Serialized as a lowercase string tag so JSON consumers can branch
    /// without numeric magic values. Mirrors the `ContentKind` enum in
    /// `ripvec-core::chunk`.
    pub content_kind: &'static str,
    /// Outgoing call-edges sorted by target file `PageRank` descending.
    pub calls: Vec<RepoMapCall>,
    /// Top-level definitions extracted from this file by tree-sitter,
    /// sorted by definition-level `PageRank` descending and pruned to
    /// the per-file token-budget allocation.
    pub symbols: Vec<RepoMapSymbol>,
    /// Number of symbols that were omitted due to budget exhaustion or
    /// logarithmic attenuation cutoff. `truncated_symbols + symbols.len()`
    /// equals the total definition count for the file.
    ///
    /// Zero means every definition of the file is present in `symbols`.
    pub truncated_symbols: usize,
    /// Number of call-edges that were omitted due to budget exhaustion
    /// or logarithmic attenuation cutoff. `truncated_calls + calls.len()`
    /// equals the total callee count for the file.
    ///
    /// Zero means every callee of the file is present in `calls`.
    pub truncated_calls: usize,
}

/// JSON-mode response envelope for `get_repo_map` (4.0.1 shape).
///
/// Replaces the `max_files`-capped shape from 4.0.0. The caller supplies a
/// `token_budget`; files are allocated bytes proportional to their `PageRank`
/// (40% cap per file, 200-byte envelope floor). Symbols are filled in
/// def-rank order with a logarithmic attenuation cutoff. Leftover bytes
/// cascade to subsequent files.
///
/// The `estimated_bytes`, `budget_bytes`, and `budget_exhausted` fields give
/// callers real-time feedback on how tightly the budget was consumed.
#[derive(Debug, Clone, Serialize)]
pub struct GetRepoMapResponse {
    /// Files sorted by `PageRank` descending, pruned to the token budget.
    pub files: Vec<RepoMapFile>,
    /// Total number of eligible files in the graph (pre-allocation).
    ///
    /// If `total_files > files.len()`, the budget ran out before all files
    /// could be included. Read `budget_exhausted` directly for the boolean.
    pub total_files: usize,
    /// Actual serialised-JSON byte count for all returned content.
    pub estimated_bytes: usize,
    /// Budget ceiling in bytes that was used for allocation
    /// (`token_budget * 4`).
    pub budget_bytes: usize,
    /// `true` when `total_files > files.len()` (budget was exhausted before
    /// all eligible files were included).
    pub budget_exhausted: bool,
    /// Retained for backward compatibility with 4.0.0 callers that checked
    /// `capped`. Equivalent to `budget_exhausted`; new callers should read
    /// `budget_exhausted` instead.
    pub capped: bool,
}

// ── Constants ────────────────────────────────────────────────────────

/// `PageRank` damping factor.
///
/// NOTE(review): presumably the probability of following an edge rather
/// than teleporting — confirm against the iteration code.
const DAMPING: f32 = 0.85;

/// `PageRank` convergence threshold.
const EPSILON: f32 = 1e-6;

/// Maximum `PageRank` iterations.
const MAX_ITERATIONS: usize = 100;

/// Maximum callers/callees stored per file.
const MAX_NEIGHBORS: usize = 5;

/// Approximate characters per token for budget estimation.
///
/// Matches the `token_budget * 4` conversion documented on
/// [`GetRepoMapResponse::budget_bytes`].
const CHARS_PER_TOKEN: usize = 4;

/// Concentration mass placed on the focus node in topic-sensitive `PageRank`.
///
/// Following Haveliwala 2002 ("Topic-Sensitive PageRank"), the personalization
/// vector places a small bias `α` on the focus node and distributes the
/// remaining `1 - α` uniformly over all other nodes. This preserves rank
/// dispersion across the corpus — the user sees a *neighborhood* of related
/// files rebiased toward the focus, not a Dirac delta on the focus node with
/// every other file collapsed to an equally negligible uniform floor.
///
/// Value 0.15 means:
///   - focus node teleportation probability = 0.15
///   - each of the (n - 1) other nodes = 0.85 / (n - 1)
///
/// Prior to 4.0.5, this constant was effectively 0.70 (70% bias on focus),
/// which caused winner-take-all collapse observed on the flask and ripvec
/// corpora (I#16).
const PERSONALIZATION_ALPHA: f32 = 0.15;

// ── Import Queries ───────────────────────────────────────────────────

/// Pick the tree-sitter grammar for a file extension and compile its import query.
///
/// Returns `None` when the extension has no import query defined here, or
/// when the query does not compile against the grammar (a warning is logged
/// in that case).
fn import_query_for_extension(ext: &str) -> Option<(tree_sitter::Language, Query)> {
    // JavaScript and both TypeScript dialects share one query shape.
    const ECMASCRIPT_IMPORTS: &str = "(import_statement source: (string) @import_path) @import";

    let grammar_and_pattern: (tree_sitter::Language, &str) = match ext {
        "rs" => (
            tree_sitter_rust::LANGUAGE.into(),
            "(use_declaration) @import",
        ),
        "py" | "pyi" => (
            tree_sitter_python::LANGUAGE.into(),
            "(import_statement) @import\n(import_from_statement) @import",
        ),
        "js" | "jsx" => (tree_sitter_javascript::LANGUAGE.into(), ECMASCRIPT_IMPORTS),
        "ts" => (
            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
            ECMASCRIPT_IMPORTS,
        ),
        "tsx" => (
            tree_sitter_typescript::LANGUAGE_TSX.into(),
            ECMASCRIPT_IMPORTS,
        ),
        "go" => (
            tree_sitter_go::LANGUAGE.into(),
            "(import_spec path: (interpreted_string_literal) @import_path) @import",
        ),
        // Ruby has no import syntax; match `require "<path>"` method calls.
        "rb" => (
            tree_sitter_ruby::LANGUAGE.into(),
            "(call method: (identifier) @_method arguments: (argument_list (string (string_content) @import_path)) (#eq? @_method \"require\")) @import",
        ),
        _ => return None,
    };
    let (lang, query_str) = grammar_and_pattern;

    let compiled = Query::new(&lang, query_str);
    match compiled {
        Ok(query) => Some((lang, query)),
        Err(e) => {
            tracing::warn!(ext, %e, "import query compilation failed — language may be ABI-incompatible");
            None
        }
    }
}

/// Run the import query over `source` and collect one string per match.
///
/// A match contributes its `@import_path` capture (with surrounding single
/// or double quotes stripped) when present, and otherwise the full text of
/// its `@import` capture. Returns an empty vector if the grammar cannot be
/// loaded or the source cannot be parsed.
fn extract_imports(
    source: &str,
    lang: &tree_sitter::Language,
    import_query: &Query,
) -> Vec<String> {
    let mut parser = Parser::new();
    let Ok(()) = parser.set_language(lang) else {
        return Vec::new();
    };
    let Some(tree) = parser.parse(source, None) else {
        return Vec::new();
    };

    let capture_names = import_query.capture_names();
    let mut found = Vec::new();
    let mut cursor = QueryCursor::new();
    let mut matches = cursor.matches(import_query, tree.root_node(), source.as_bytes());

    while let Some(m) = matches.next() {
        // The dedicated path capture (JS/TS/Go/Ruby) wins over the whole statement.
        let mut explicit_path: Option<&str> = None;
        let mut whole_statement: Option<&str> = None;

        for capture in m.captures {
            let text = &source[capture.node.start_byte()..capture.node.end_byte()];
            let label = &capture_names[capture.index as usize];
            if *label == "import_path" {
                explicit_path = Some(text.trim_matches(['"', '\'']));
            } else if *label == "import" {
                whole_statement = Some(text);
            }
        }

        found.extend(explicit_path.or(whole_statement).map(str::to_string));
    }

    found
}

// ── Import Resolution ────────────────────────────────────────────────

/// Resolve a Rust `use` declaration to a file index in the file map.
///
/// `raw` is the text of the declaration (e.g. `use crate::foo::Bar;`). An
/// optional visibility qualifier (`pub`, `pub(crate)`, `pub(in path)`) is
/// ignored, so re-exports resolve like ordinary imports.
///
/// Only paths starting with `crate`, `self`, or `super` are resolved;
/// anything else is treated as an external crate and yields `None`.
///
/// * `crate::` resolves against `<dir>/src`, where `<dir>` is the nearest
///   ancestor of `file_path` containing a `Cargo.toml` (this touches the
///   file system). `root/src` is the fallback when none is found.
/// * `self::` resolves against the directory that holds the current
///   module's children: the parent directory for `mod.rs`, `lib.rs`, and
///   `main.rs`, and `<parent>/<stem>/` for any other file.
/// * `super::` resolves against the parent of that directory. Repeated
///   `super::super::` segments walk up one directory each.
///
/// For `self`/`super` the bases used before this module-aware lookup
/// (file's parent, and grandparent respectively) are kept as a fallback, so
/// crate-root files other than `lib.rs`/`main.rs` still resolve.
///
/// Trailing path segments may name items rather than modules, so
/// progressively shorter prefixes are tried, each as `<path>.rs` and then
/// `<path>/mod.rs`. Returns the index of the first candidate present in
/// `file_index`, or `None`.
fn resolve_rust_import(
    raw: &str,
    file_path: &Path,
    root: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    // Drop a leading visibility qualifier, but only when a `use` keyword
    // follows it, so paths that merely start with "pub" are left alone.
    let mut decl = raw.trim();
    if let Some(after_pub) = decl.strip_prefix("pub") {
        let after_pub = after_pub.trim_start();
        let after_vis = match after_pub.strip_prefix('(') {
            Some(restriction) => restriction
                .split_once(')')
                .map(|(_, tail)| tail.trim_start()),
            None => Some(after_pub),
        };
        if let Some(tail) = after_vis.filter(|tail| tail.starts_with("use ")) {
            decl = tail;
        }
    }
    let use_path = decl
        .trim_start_matches("use ")
        .trim_end_matches(';')
        .trim();

    let segments: Vec<&str> = use_path.split("::").collect();
    let first = segments.first()?.trim();

    // Candidate base directories, most accurate first.
    let mut bases: Vec<PathBuf> = Vec::with_capacity(2);
    match first {
        "crate" => {
            // In a workspace, `crate::foo` is relative to the member crate's
            // `src/`, not the workspace root.
            let crate_src = file_path
                .ancestors()
                .skip(1)
                .find(|dir| dir.join("Cargo.toml").exists())
                .map_or_else(|| root.join("src"), |dir| dir.join("src"));
            bases.push(crate_src);
        }
        "self" | "super" => {
            let parent = file_path.parent()?;
            // Directory in which the current module's child modules live.
            let module_dir = match file_path.file_stem().and_then(|stem| stem.to_str()) {
                Some("mod" | "lib" | "main") | None => parent.to_path_buf(),
                Some(stem) => parent.join(stem),
            };
            if first == "self" {
                bases.push(module_dir);
                bases.push(parent.to_path_buf());
            } else {
                if let Some(dir) = module_dir.parent() {
                    bases.push(dir.to_path_buf());
                }
                if let Some(dir) = parent.parent() {
                    bases.push(dir.to_path_buf());
                }
            }
        }
        // External crate — drop
        _ => return None,
    }

    // Each additional leading `super` climbs one more directory.
    let mut skip = 1;
    while segments.get(skip).is_some_and(|seg| seg.trim() == "super") {
        bases = bases
            .iter()
            .filter_map(|base| base.parent().map(Path::to_path_buf))
            .collect();
        skip += 1;
    }
    bases.dedup();

    let path_segments = &segments[skip..];
    for base in &bases {
        // Longest prefix first: trailing segments may be items, not modules.
        for end in (1..=path_segments.len()).rev() {
            let mut candidate = base.clone();
            for seg in &path_segments[..end] {
                // Keep only the identifier: cut brace groups (`{Foo, Bar}`)
                // and `as` aliases (`foo as bar`).
                let clean = seg
                    .split('{')
                    .next()
                    .unwrap_or(seg)
                    .split_whitespace()
                    .next()
                    .unwrap_or("");
                if !clean.is_empty() {
                    candidate.push(clean);
                }
            }

            // Try file.rs
            if let Some(&idx) = file_index.get(&candidate.with_extension("rs")) {
                return Some(idx);
            }

            // Try dir/mod.rs
            if let Some(&idx) = file_index.get(&candidate.join("mod.rs")) {
                return Some(idx);
            }
        }
    }

    None
}

/// Dispatch import resolution to the resolver for the importing file's language.
///
/// `ext` selects the resolver: Rust (`rs`), Python (`py`, `pyi`), or
/// JavaScript/TypeScript (`js`, `jsx`, `ts`, `tsx`). Every other extension
/// yields `None`.
fn resolve_import(
    raw: &str,
    ext: &str,
    file_path: &Path,
    root: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    if ext == "rs" {
        return resolve_rust_import(raw, file_path, root, file_index);
    }
    if matches!(ext, "py" | "pyi") {
        return resolve_python_import(raw, root, file_index);
    }
    if matches!(ext, "js" | "jsx" | "ts" | "tsx") {
        return resolve_js_import(raw, file_path, file_index);
    }
    // Everything else is unresolved; Go in particular imports by full
    // package path, which has no local file to map to.
    None
}

/// Resolve a Python import to a file index.
///
/// Handles `import foo.bar` and `from foo.bar import baz` statements. The
/// dotted module path is mapped onto a path below `root`, and these
/// candidates are tried in order: `foo/bar.py`, `foo/bar.pyi`,
/// `foo/bar/__init__.py`, `foo/bar/__init__.pyi`.
///
/// For a multi-module statement such as `import a.b, c` only the first
/// module (`a.b`) is considered. Returns `None` when `raw` is not an import
/// statement or no candidate is present in `file_index`.
fn resolve_python_import(
    raw: &str,
    root: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    const MODULE_EXTENSIONS: [&str; 2] = ["py", "pyi"];
    const PACKAGE_INITS: [&str; 2] = ["__init__.py", "__init__.pyi"];

    let rest = raw
        .strip_prefix("from ")
        .or_else(|| raw.strip_prefix("import "))?;

    // First whitespace-delimited token, cut at a comma so that
    // `import a.b, c` and `import a.b,c` both yield `a.b`.
    let module_path = rest.split_whitespace().next()?.split(',').next()?;
    if module_path.is_empty() {
        return None;
    }

    let rel_path: PathBuf = module_path.split('.').collect();
    let module_base = root.join(&rel_path);

    for ext in &MODULE_EXTENSIONS {
        if let Some(&idx) = file_index.get(&module_base.with_extension(ext)) {
            return Some(idx);
        }
    }

    for init_name in &PACKAGE_INITS {
        if let Some(&idx) = file_index.get(&module_base.join(init_name)) {
            return Some(idx);
        }
    }

    None
}

/// Resolve a JS/TS import to a file index.
///
/// Only relative specifiers (starting with `.`) are handled; bare package
/// specifiers return `None`. The specifier is joined onto the importing
/// file's directory and `..` components are collapsed lexically, so
/// `../bar` can match the un-dotted keys of `file_index`.
///
/// Candidates are tried in this order, each over `js`, `jsx`, `ts`, `tsx`:
///
/// 1. the path exactly as written (e.g. `./foo.ts`);
/// 2. the path with an extension appended, which keeps dotted basenames
///    such as `./foo.service` intact (`foo.service.ts`);
/// 3. the path with its extension replaced (`./foo.js` → `foo.ts`);
/// 4. `<path>/index.<ext>`.
fn resolve_js_import(
    raw: &str,
    file_path: &Path,
    file_index: &HashMap<PathBuf, usize>,
) -> Option<usize> {
    use std::path::Component;

    const EXTENSIONS: [&str; 4] = ["js", "jsx", "ts", "tsx"];

    if !raw.starts_with('.') {
        return None;
    }

    let dir = file_path.parent()?;

    // `Path` equality already ignores interior `.` components but never
    // collapses `..`, so fold those by hand. A `..` is only folded into a
    // preceding normal component; a leading `..` (or one after the root) is
    // kept as written.
    let mut candidate = PathBuf::new();
    for component in dir.join(raw).components() {
        let last_is_normal = matches!(
            candidate.components().next_back(),
            Some(Component::Normal(_))
        );
        if component == Component::ParentDir && last_is_normal {
            candidate.pop();
        } else {
            candidate.push(component.as_os_str());
        }
    }

    // 1. Exactly as written.
    if let Some(&idx) = file_index.get(&candidate) {
        return Some(idx);
    }

    // 2. Extension appended to the full file name.
    if let Some(file_name) = candidate.file_name() {
        for ext in &EXTENSIONS {
            let mut name = file_name.to_os_string();
            name.push(".");
            name.push(ext);
            if let Some(&idx) = file_index.get(&candidate.with_file_name(&name)) {
                return Some(idx);
            }
        }
    }

    // 3. Extension replaced.
    for ext in &EXTENSIONS {
        if let Some(&idx) = file_index.get(&candidate.with_extension(ext)) {
            return Some(idx);
        }
    }

    // 4. Directory import.
    for ext in &EXTENSIONS {
        let index_file = candidate.join("index").with_extension(ext);
        if let Some(&idx) = file_index.get(&index_file) {
            return Some(idx);
        }
    }

    None
}

// ── Extraction ───────────────────────────────────────────────────────

/// Collect every definition matched by the language's definition query.
///
/// Each query match that carries a `@def` capture becomes one
/// [`Definition`]; its name comes from the `@name` capture (empty when that
/// capture is missing). Line numbers are converted to 1-based. `calls` is
/// left empty. Returns an empty vector if the grammar cannot be loaded or
/// the source cannot be parsed.
fn extract_definitions(source: &str, config: &languages::LangConfig) -> Vec<Definition> {
    let mut parser = Parser::new();
    let Ok(()) = parser.set_language(&config.language) else {
        return Vec::new();
    };
    let Some(tree) = parser.parse(source, None) else {
        return Vec::new();
    };

    let mut found = Vec::new();
    let mut cursor = QueryCursor::new();
    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

    while let Some(m) = matches.next() {
        let mut def_name = String::new();
        let mut captured_def = None;

        for capture in m.captures {
            let label = &config.query.capture_names()[capture.index as usize];
            if *label == "def" {
                captured_def = Some(capture.node);
            } else if *label == "name" {
                def_name = source[capture.node.start_byte()..capture.node.end_byte()].to_string();
            }
        }

        // Matches without a `@def` capture carry nothing to record.
        let Some(node) = captured_def else {
            continue;
        };

        let scope = crate::chunk::build_scope_chain(node, source);
        let signature = crate::chunk::extract_signature(node, source);

        #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
        let start_byte = node.start_byte() as u32;
        #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
        let end_byte = node.end_byte() as u32;
        // tree-sitter rows are 0-based; stored lines are 1-based.
        #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
        let start_line = node.start_position().row as u32 + 1;
        #[expect(clippy::cast_possible_truncation, reason = "line numbers fit in u32")]
        let end_line = node.end_position().row as u32 + 1;

        found.push(Definition {
            name: def_name,
            kind: node.kind().to_string(),
            start_line,
            end_line,
            scope,
            signature,
            start_byte,
            end_byte,
            calls: Vec::new(),
        });
    }

    found
}

// ── Call Extraction & Resolution ────────────────────────────────────

/// Tiebreak rank used when two definitions cover the same byte span.
///
/// Function-like node kinds rank `0`, everything else (class body blocks,
/// impl items, and other structural containers) ranks `1`. The value is
/// meant as the secondary key of a `min_by_key`, so the lower rank wins.
///
/// This matters for Python, where a class body `block` and the first
/// `function_definition` inside it can occupy identical byte ranges; a call
/// inside the function body must be attributed to the function, not to the
/// class block.
fn is_callable_def_priority(kind: &str) -> u8 {
    /// Node kinds that denote a function or method definition.
    const CALLABLE_KINDS: [&str; 7] = [
        "function_item",
        "function_definition",
        "function_declaration",
        "function_signature_item",
        "method_definition",
        "method_declaration",
        "method",
    ];

    if CALLABLE_KINDS.contains(&kind) { 0 } else { 1 }
}

/// Extract call sites from a source file and assign them to definitions.
///
/// Uses the language's call query to find all call expressions, then
/// assigns each call to the definition whose byte range contains it.
/// Calls outside any definition body (module-level) are ignored, as are
/// calls whose bare name equals the name of the enclosing definition.
///
/// For Rust scoped calls (`a::b::foo()`), the `@callee` capture returns the
/// full `scoped_identifier` node. This function splits it into:
/// - `name` = bare trailing identifier (`"foo"`)
/// - `qualified_path` = `Some("a::b::foo")` for disambiguation in `resolve_calls`.
///
/// For method calls (`x.method()`), `receiver_type` is inferred from local
/// context (parameter annotations, let-bindings, impl blocks). See
/// [`infer_receiver_types`] for the heuristic.
///
/// Results are appended to `defs[i].calls` in place. If the grammar cannot
/// be loaded or the source cannot be parsed, `defs` is left untouched.
fn extract_calls(source: &str, call_config: &languages::CallConfig, defs: &mut [Definition]) {
    let mut parser = Parser::new();
    if parser.set_language(&call_config.language).is_err() {
        return;
    }
    let Some(tree) = parser.parse(source, None) else {
        return;
    };

    // Build receiver-type map: byte_offset_of_call → receiver_type_string.
    // Done once per file to amortise the tree walk cost.
    let receiver_map = infer_receiver_types(source, &tree, &call_config.language);

    // HCL: run the HCL-specific call-edge extractor as an extra pass so the
    // terraform_remote_state references and module blocks contribute edges
    // that the generic function_call query cannot capture (R2 + R3, Wave 3).
    // It runs before the generic query below, so its CallRefs come first in
    // each definition's `calls`.
    if languages::is_hcl_language(&call_config.language) {
        extract_hcl_call_edges(source, tree.root_node(), defs);
    }

    let mut cursor = QueryCursor::new();
    let mut matches = cursor.matches(&call_config.query, tree.root_node(), source.as_bytes());

    while let Some(m) = matches.next() {
        let mut full_callee_text = None;
        let mut call_byte = 0u32;

        // Only the `@callee` capture matters: it supplies both the callee
        // text and the byte offset recorded for the call.
        for cap in m.captures {
            let cap_name = &call_config.query.capture_names()[cap.index as usize];
            if *cap_name == "callee" {
                full_callee_text =
                    Some(source[cap.node.start_byte()..cap.node.end_byte()].to_string());
                #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
                {
                    call_byte = cap.node.start_byte() as u32;
                }
            }
        }

        if let Some(full_text) = full_callee_text {
            // Split qualified path into bare name + optional qualifier.
            let (name, qualified_path) = if full_text.contains("::") {
                let bare = full_text
                    .rsplit("::")
                    .next()
                    .unwrap_or(&full_text)
                    .to_string();
                (bare, Some(full_text))
            } else {
                (full_text, None)
            };

            // Look up receiver type from the pre-built map.
            let receiver_type = receiver_map.get(&call_byte).cloned();

            // Assign to the most-specific (smallest byte range) enclosing definition.
            // Using `find` (first match) was incorrect for nested defs: an `impl_item`
            // wrapping a `function_item` both contain the call site, but the
            // `function_item` is the correct granularity for method attribution.
            //
            // Tiebreak: when two defs have equal byte spans (as happens in Python where
            // the class body `block` and its first `function_definition` share the same
            // start/end bytes), prefer function-like defs over structural container defs.
            // `is_callable_def_priority` returns 0 for function-like kinds (sorts first
            // in min_by_key).
            let enclosing_idx = defs
                .iter()
                .enumerate()
                .filter(|(_, d)| d.start_byte <= call_byte && call_byte < d.end_byte)
                .min_by_key(|(_, d)| (d.end_byte - d.start_byte, is_callable_def_priority(&d.kind)))
                .map(|(i, _)| i);

            if let Some(idx) = enclosing_idx {
                // Skip self-recursive calls (compare bare name to def name).
                if defs[idx].name != name {
                    defs[idx].calls.push(CallRef {
                        name,
                        qualified_path,
                        receiver_type,
                        byte_offset: call_byte,
                        resolved: None,
                    });
                }
            }
            // Calls outside any definition are ignored (module-level init).
        }
    }
}

// HCL: post-pass call-edge extraction for terraform_remote_state and module
// blocks. These are not function calls — they are HCL-specific structural
// references to other Terraform modules — so the generic
// `(function_call (identifier) @callee) @call` pattern in
// `call_query_for_extension("tf")` cannot capture them. This helper runs
// once per HCL file inside `extract_calls` (R2 + R3, Wave 3).

/// Traverse an HCL parse tree and synthesize `CallRef` entries for the two
/// Terraform reference forms that are not function calls:
///
/// 1. `data.terraform_remote_state.<NAME>.outputs.<ATTR>` expressions:
///    one `CallRef` per reference, with `name = NAME` and
///    `qualified_path = Some("terraform_remote_state.NAME")`. These
///    connect the current file to the named remote-state module's outputs.
///
/// 2. `module "X" { source = "../X" }` blocks: one `CallRef` with
///    `name = X` (the label) and `qualified_path = Some("module.X")`.
///    The module reference connects to the module's directory in
///    `resolve_import` (HCL module-source resolution is not implemented
///    yet — the `qualified_path` carrier is the contract; resolve adds the
///    file lookup).
///
/// Every emitted `CallRef` is attached to the smallest enclosing definition
/// by byte range, the same heuristic `extract_calls` uses.
fn extract_hcl_call_edges(source: &str, root: tree_sitter::Node<'_>, defs: &mut [Definition]) {
    // Explicit stack instead of recursion; only named nodes are descended into.
    let mut pending: Vec<tree_sitter::Node<'_>> = vec![root];

    while let Some(current) = pending.pop() {
        let kind = current.kind();
        if kind == "expression" {
            hcl_visit_expression(source, current, defs);
        } else if kind == "block" {
            hcl_visit_block(source, current, defs);
        }

        let mut walker = current.walk();
        pending.extend(
            current
                .children(&mut walker)
                .filter(|child| child.is_named()),
        );
    }
}

/// Check one HCL `expression` node for a
/// `data.terraform_remote_state.<NAME>.outputs.<ATTR>` reference and, if it
/// matches, attach a synthesized call to the enclosing definition.
///
/// The expected node shape is:
/// ```text
/// expression
///   variable_expr
///     identifier "data"
///   get_attr
///     identifier "terraform_remote_state"
///   get_attr
///     identifier "<NAME>"
///   get_attr
///     identifier "outputs"
///   get_attr
///     identifier "<ATTR>"
/// ```
///
/// Only the first two attributes are actually checked: the chain must
/// start with `terraform_remote_state` and be followed by a name. The
/// emitted call has `name = <NAME>` and
/// `qualified_path = "terraform_remote_state.<NAME>"`, positioned at the
/// start byte of the expression.
fn hcl_visit_expression(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
    let mut cursor = node.walk();
    let mut children = node.children(&mut cursor);

    // Head of the expression: a `variable_expr` whose identifier is `data`.
    let Some(head) = children.next() else {
        return;
    };
    if head.kind() != "variable_expr" {
        return;
    }
    let head_ident = match head.child_by_field_name("name") {
        Some(named) => named,
        None => {
            // No `name` field: fall back to the first identifier child.
            let mut head_cursor = head.walk();
            let fallback = head
                .children(&mut head_cursor)
                .find(|n| n.kind() == "identifier");
            let Some(ident) = fallback else {
                return;
            };
            ident
        }
    };
    if &source[head_ident.start_byte()..head_ident.end_byte()] != "data" {
        return;
    }

    // Tail of the expression: nothing but `get_attr` nodes, each holding an
    // identifier. Any other child disqualifies the whole expression.
    let attr_names = children
        .map(|attr| {
            if attr.kind() != "get_attr" {
                return None;
            }
            let mut attr_cursor = attr.walk();
            let ident = attr
                .children(&mut attr_cursor)
                .find(|n| n.kind() == "identifier")?;
            Some(&source[ident.start_byte()..ident.end_byte()])
        })
        .collect::<Option<Vec<&str>>>();
    let Some(attr_names) = attr_names else {
        return;
    };

    // Expect: terraform_remote_state, <NAME>, outputs, <ATTR>
    let [marker, target, ..] = attr_names.as_slice() else {
        return;
    };
    if *marker != "terraform_remote_state" {
        return;
    }
    let name = (*target).to_string();
    let qualified_path = format!("terraform_remote_state.{name}");

    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
    let call_byte = node.start_byte() as u32;
    attach_hcl_call(defs, call_byte, name, Some(qualified_path));
}

/// Recognise an HCL `module "X" { … }` block and record it as a call.
///
/// The block qualifies when its first child is the identifier `module` and it
/// has a `string_lit` label containing a `template_literal`. The label text
/// becomes the callee name and `module.<label>` the qualified path. The
/// block's `source = …` attribute is not inspected here.
///
/// Emits at most one CallRef per block, attributed via [`attach_hcl_call`].
fn hcl_visit_block(source: &str, node: tree_sitter::Node<'_>, defs: &mut [Definition]) {
    let mut walker = node.walk();
    let mut kids = node.children(&mut walker);

    // Leading keyword must be the bare identifier `module`.
    let is_module_block = kids
        .next()
        .is_some_and(|kw| kw.kind() == "identifier" && &source[kw.byte_range()] == "module");
    if !is_module_block {
        return;
    }

    // The first string literal among the remaining children is the label.
    let Some(label_lit) = kids.find(|k| k.kind() == "string_lit") else {
        return;
    };
    // The label text (without quotes) lives in the `template_literal` child.
    let mut lit_walker = label_lit.walk();
    let Some(label_text) = label_lit
        .children(&mut lit_walker)
        .find(|k| k.kind() == "template_literal")
    else {
        return;
    };

    let label = source[label_text.byte_range()].to_string();
    let qualified_path = format!("module.{label}");

    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
    let call_byte = node.start_byte() as u32;
    attach_hcl_call(defs, call_byte, label, Some(qualified_path));
}

/// Record a synthesized HCL call on the tightest definition that contains it.
///
/// Among the definitions whose `[start_byte, end_byte)` range covers
/// `call_byte`, the one with the smallest span is chosen, with
/// `is_callable_def_priority` breaking ties between equal spans. If no
/// definition covers the offset, nothing is recorded. A call whose `name`
/// equals the chosen definition's own name is dropped so the definition does
/// not appear to call itself.
fn attach_hcl_call(
    defs: &mut [Definition],
    call_byte: u32,
    name: String,
    qualified_path: Option<String>,
) {
    let innermost = defs
        .iter_mut()
        .filter(|def| (def.start_byte..def.end_byte).contains(&call_byte))
        .min_by_key(|def| {
            (
                def.end_byte - def.start_byte,
                is_callable_def_priority(&def.kind),
            )
        });
    let Some(host) = innermost else {
        return;
    };
    // Same name as the enclosing definition: would be a self-edge, skip it.
    if host.name == name {
        return;
    }
    host.calls.push(CallRef {
        name,
        qualified_path,
        receiver_type: None,
        byte_offset: call_byte,
        resolved: None,
    });
}

/// Guess the receiver type of method calls using only local syntactic context.
///
/// The result maps the byte offset of a call's `@callee` capture to the name
/// of the inferred receiver type. The work is delegated to one collector per
/// supported language:
///
/// - **Rust** — [`collect_rust_receiver_types`]:
///   1. `self.method()` inside `impl Foo { … }` → `"Foo"`.
///   2. `x.method()` where `x: Bar` is a parameter of the function → `"Bar"`.
///   3. `x.method()` where the function contains `let x = Foo::new()` → `"Foo"`.
///
/// - **Python** — [`collect_python_receiver_types`]:
///   1. `self.method()` inside a method → name of the enclosing
///      `class_definition`.
///   2. `instance.method()` where `instance` has a `ClassName` parameter
///      annotation or an `instance = ClassName(...)` assignment in the same
///      function.
///
/// - **Go** — [`collect_go_receiver_types`] (see that function for its rules).
///
/// Any other language yields an empty map. The inference is heuristic, not a
/// type checker: call sites it cannot classify simply have no entry, and
/// `extract_calls` leaves those with `receiver_type = None`.
fn infer_receiver_types(
    source: &str,
    tree: &tree_sitter::Tree,
    language: &tree_sitter::Language,
) -> HashMap<u32, String> {
    let root = tree.root_node();
    let mut receiver_types = HashMap::new();

    if languages::is_rust_language(language) {
        collect_rust_receiver_types(source, root, &mut receiver_types);
    } else if languages::is_python_language(language) {
        collect_python_receiver_types(source, root, &mut receiver_types);
    } else if languages::is_go_language(language) {
        collect_go_receiver_types(source, root, &mut receiver_types);
    }

    receiver_types
}

/// Traverse a Rust parse tree and record inferred receiver types in `map`.
///
/// The traversal uses an explicit stack rather than recursion. Each entry
/// pairs a node with the self type of the nearest enclosing `impl` block
/// that has one:
///
/// - `impl_item`: its children inherit the impl's self type (see
///   [`extract_impl_self_type`]), or the outer context if none is found.
/// - `function_item`: parameter annotations and constructor `let` bindings
///   are collected, then every method call inside the function is annotated
///   by [`annotate_method_calls`].
///
/// Children of every node are pushed, including those of functions, so nested
/// `impl` blocks and nested functions are visited as well. A nested function
/// is annotated again with its own bindings after its parent has been.
fn collect_rust_receiver_types(
    source: &str,
    node: tree_sitter::Node<'_>,
    map: &mut HashMap<u32, String>,
) {
    let mut pending: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(node, None)];

    while let Some((current, enclosing_impl)) = pending.pop() {
        let kind = current.kind();

        if kind == "function_item" {
            let params = extract_param_types(source, current);
            let locals = extract_let_binding_types(source, current);
            annotate_method_calls(
                source,
                current,
                enclosing_impl.as_deref(),
                &params,
                &locals,
                map,
            );
        }

        // Only an `impl_item` can change the context handed to descendants.
        let child_ctx = if kind == "impl_item" {
            extract_impl_self_type(source, current).or(enclosing_impl)
        } else {
            enclosing_impl
        };

        let mut walker = current.walk();
        pending.extend(
            current
                .children(&mut walker)
                .map(|child| (child, child_ctx.clone())),
        );
    }
}

/// Return the source text of an `impl_item`'s `type` field.
///
/// For `impl Foo { … }` → `Some("Foo")`.
/// For `impl Trait for Foo { … }` → `Some("Foo")` (the concrete `for` type).
/// Returns `None` when the node has no `type` field.
///
/// NOTE(review): the text of the whole `type` node is returned verbatim, so a
/// generic self type presumably comes back with its arguments (e.g. `Foo<T>`),
/// whereas parameter types go through `extract_base_type` and are reduced to
/// the bare name. Confirm that downstream matching expects this.
fn extract_impl_self_type(source: &str, impl_node: tree_sitter::Node<'_>) -> Option<String> {
    impl_node
        .child_by_field_name("type")
        .map(|self_ty| source[self_ty.byte_range()].to_owned())
}

/// Build a `name → type` map from a function's parameter list.
///
/// `fn foo(x: Bar, y: &Baz)` yields `{"x": "Bar", "y": "Baz"}`. Types are
/// reduced to a bare name by [`extract_base_type`]; parameters whose type
/// cannot be reduced are omitted. Only direct children of kind `parameter`
/// or `typed_pattern` are considered, so the `self` receiver never appears in
/// the result (it is handled through the impl context instead).
fn extract_param_types(source: &str, fn_node: tree_sitter::Node<'_>) -> HashMap<String, String> {
    /// Node kinds accepted as a parameter's type annotation.
    const TYPE_KINDS: [&str; 4] = [
        "type_identifier",
        "generic_type",
        "reference_type",
        "scoped_type_identifier",
    ];

    let mut bindings = HashMap::new();
    let Some(param_list) = fn_node.child_by_field_name("parameters") else {
        return bindings;
    };

    let mut list_walker = param_list.walk();
    for entry in param_list.children(&mut list_walker) {
        // `parameter` nodes may carry a leading `mutable_specifier`, which
        // must not be mistaken for the name; `typed_pattern` nodes are read
        // without that extra handling.
        let in_parameter = match entry.kind() {
            "parameter" => true,
            "typed_pattern" => false,
            _ => continue,
        };

        let mut name: Option<String> = None;
        let mut ty: Option<String> = None;
        let mut entry_walker = entry.walk();
        for part in entry.children(&mut entry_walker) {
            let kind = part.kind();
            let name_candidate =
                kind == "identifier" || (in_parameter && kind == "mutable_specifier");
            if name_candidate && name.is_none() {
                let text = &source[part.byte_range()];
                if !(in_parameter && text == "mut") {
                    name = Some(text.to_string());
                }
            } else if TYPE_KINDS.contains(&kind) && ty.is_none() {
                ty = Some(extract_base_type(source, part));
            }
        }

        // Keep the pair only when both halves were found and the type
        // reduced to a non-empty name.
        if let Some((name, ty)) = name.zip(ty).filter(|(_, ty)| !ty.is_empty()) {
            bindings.insert(name, ty);
        }
    }
    bindings
}

/// Reduce a type node to the text of a bare `type_identifier`.
///
/// - A `type_identifier` is returned as-is (`Bar` → `"Bar"`).
/// - Wrapper kinds (`generic_type`, `reference_type`, `mutable_specifier`,
///   `scoped_type_identifier`) are searched depth-first, left to right, and
///   the first non-empty result wins — e.g. `&Bar`, `&mut Bar`, `Bar<T>` and
///   `module::Bar` all give `"Bar"`.
/// - Any other kind is only checked for a direct `type_identifier` child.
///
/// Returns an empty string when no identifier is found.
fn extract_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
    match node.kind() {
        "type_identifier" => source[node.byte_range()].to_string(),
        "generic_type" | "reference_type" | "mutable_specifier" | "scoped_type_identifier" => {
            let mut walker = node.walk();
            let base = node
                .children(&mut walker)
                .map(|child| extract_base_type(source, child))
                .find(|found| !found.is_empty());
            base.unwrap_or_default()
        }
        _ => {
            let mut walker = node.walk();
            let direct = node
                .children(&mut walker)
                .find(|child| child.kind() == "type_identifier");
            direct
                .map(|child| source[child.byte_range()].to_string())
                .unwrap_or_default()
        }
    }
}

/// Collect `let x = Foo::new()`-style bindings from a function body.
///
/// Returns a map from the bound variable to the leading path segment of the
/// called function, e.g. `let x = Foo::new();` → `{"x": "Foo"}`. A binding is
/// recorded only when the `let_declaration` has a direct `identifier` child,
/// its value is a `call_expression` whose callee is a `scoped_identifier`,
/// and the first `::` segment starts with an uppercase character.
///
/// The scan covers the whole body, including nested blocks, and is not
/// flow-sensitive: if a name is bound more than once, whichever declaration
/// is visited last by the stack-based walk wins.
fn extract_let_binding_types(
    source: &str,
    fn_node: tree_sitter::Node<'_>,
) -> HashMap<String, String> {
    let mut locals = HashMap::new();
    let Some(body) = fn_node.child_by_field_name("body") else {
        return locals;
    };

    let mut pending = vec![body];
    while let Some(current) = pending.pop() {
        if current.kind() == "let_declaration" {
            let mut var_name: Option<&str> = None;
            let mut ctor_type: Option<&str> = None;

            let mut decl_walker = current.walk();
            for part in current.children(&mut decl_walker) {
                match part.kind() {
                    "identifier" if var_name.is_none() => {
                        var_name = Some(&source[part.byte_range()]);
                    }
                    "call_expression" => {
                        // `Foo::new(…)`: keep the head segment when it looks
                        // like a type name (uppercase initial).
                        if let Some(callee) = part.child_by_field_name("function")
                            && callee.kind() == "scoped_identifier"
                            && let Some(head) = source[callee.byte_range()].split("::").next()
                            && head.chars().next().is_some_and(char::is_uppercase)
                        {
                            ctor_type = Some(head);
                        }
                    }
                    _ => {}
                }
            }

            if let (Some(var), Some(ty)) = (var_name, ctor_type) {
                locals.insert(var.to_string(), ty.to_string());
            }
        }

        // Descend into every node, `let` declarations included.
        let mut walker = current.walk();
        pending.extend(current.children(&mut walker));
    }

    locals
}

/// Walk a function body and annotate method-call byte offsets with receiver types.
///
/// A "method call" in the ripvec call query is:
/// `(call_expression function: (field_expression field: (field_identifier) @callee))`
///
/// The receiver is the `value` child of `field_expression`. This function
/// checks whether the receiver is:
/// - `self` → use `impl_ctx` type.
/// - An identifier matching a parameter type in `param_types`.
/// - An identifier matching a constructor let-binding in `let_types`.
fn annotate_method_calls(
    source: &str,
    fn_node: tree_sitter::Node<'_>,
    impl_ctx: Option<&str>,
    param_types: &HashMap<String, String>,
    let_types: &HashMap<String, String>,
    map: &mut HashMap<u32, String>,
) {
    // Walk the entire function (including its body) looking for call_expression nodes.
    let mut stack = vec![fn_node];
    while let Some(n) = stack.pop() {
        if n.kind() == "call_expression"
            && let Some(func) = n.child_by_field_name("function")
            && func.kind() == "field_expression"
        {
            // field_expression: value (receiver) + field (method name identifier)
            if let (Some(recv), Some(field)) = (
                func.child_by_field_name("value"),
                func.child_by_field_name("field"),
            ) {
                let recv_text = source[recv.start_byte()..recv.end_byte()].to_string();
                let receiver_type = if recv_text == "self" || recv_text == "*self" {
                    impl_ctx.map(str::to_owned)
                } else {
                    // Strip ref sigils for lookup.
                    let base = recv_text
                        .trim_start_matches('*')
                        .trim_start_matches('&')
                        .trim();
                    param_types
                        .get(base)
                        .or_else(|| let_types.get(base))
                        .cloned()
                };

                if let Some(ty) = receiver_type {
                    // The `@callee` capture byte offset is the start of the field node.
                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
                    let field_byte = field.start_byte() as u32;
                    map.insert(field_byte, ty);
                }
            }
        }
        let mut cursor = n.walk();
        for child in n.children(&mut cursor) {
            stack.push(child);
        }
    }
}

// ── Python receiver-type heuristic ───────────────────────────────────

/// Traverse a Python parse tree and record inferred receiver types in `map`.
///
/// Two heuristics are applied to every `function_definition`:
///
/// 1. **`self.method()` / `cls.method()`** — the receiver type is the name of
///    the nearest enclosing `class_definition`.
///
/// 2. **`instance.method()`** — when `instance` is a parameter with a type
///    annotation (`instance: ClassName`) or is assigned from a constructor
///    call (`instance = ClassName(...)`) inside the same function, the
///    receiver type is `ClassName`.
///
/// The call sites examined are those of the shape
/// `(call function: (attribute object: <receiver> attribute: (identifier)))`.
/// Entries are keyed by the start byte of the `attribute` identifier, i.e.
/// where the call query's `@callee` capture begins.
///
/// The traversal uses an explicit stack whose entries pair a node with the
/// enclosing class name. Children of functions are pushed too, so nested
/// classes and nested functions are visited.
fn collect_python_receiver_types(
    source: &str,
    root: tree_sitter::Node<'_>,
    map: &mut HashMap<u32, String>,
) {
    let mut pending: Vec<(tree_sitter::Node<'_>, Option<String>)> = vec![(root, None)];

    while let Some((current, enclosing_class)) = pending.pop() {
        let kind = current.kind();

        if kind == "function_definition" {
            let annotated_params = extract_python_param_types(source, current);
            let constructed_locals = extract_python_assignment_types(source, current);
            annotate_python_method_calls(
                source,
                current,
                enclosing_class.as_deref(),
                &annotated_params,
                &constructed_locals,
                map,
            );
        }

        // Only a `class_definition` can change the context of its descendants;
        // a class without a `name` field keeps the outer context.
        let child_ctx = if kind == "class_definition" {
            current
                .child_by_field_name("name")
                .map(|name| source[name.byte_range()].to_string())
                .or(enclosing_class)
        } else {
            enclosing_class
        };

        let mut walker = current.walk();
        pending.extend(
            current
                .children(&mut walker)
                .map(|child| (child, child_ctx.clone())),
        );
    }
}

/// Build a `name → annotated type` map from a Python function's parameters.
///
/// `def foo(self, x: Bar, y: Baz = None)` yields `{"x": "Bar", "y": "Baz"}`.
/// Only `typed_parameter` and `typed_default_parameter` children of the
/// `parameters` field are read; annotations are reduced to a bare name by
/// [`extract_python_base_type`]. Parameters named `self` or `cls` are never
/// recorded (the class context covers them).
fn extract_python_param_types(
    source: &str,
    fn_node: tree_sitter::Node<'_>,
) -> HashMap<String, String> {
    let mut annotated = HashMap::new();
    let Some(param_list) = fn_node.child_by_field_name("parameters") else {
        return annotated;
    };
    let is_receiver_name = |text: &str| text == "self" || text == "cls";

    let mut list_walker = param_list.walk();
    for entry in param_list.children(&mut list_walker) {
        match entry.kind() {
            "typed_parameter" => {
                // Positional layout: the first `identifier` is the name, and
                // the next `type` / `identifier` / `attribute` child after a
                // name has been accepted is the annotation.
                let mut name: Option<String> = None;
                let mut ty: Option<String> = None;
                let mut entry_walker = entry.walk();
                for part in entry.children(&mut entry_walker) {
                    let kind = part.kind();
                    if kind == "identifier" && name.is_none() {
                        let text = &source[part.byte_range()];
                        if !is_receiver_name(text) {
                            name = Some(text.to_string());
                        }
                    } else if matches!(kind, "type" | "identifier" | "attribute")
                        && ty.is_none()
                        && name.is_some()
                    {
                        ty = Some(extract_python_base_type(source, part));
                    }
                }

                let usable = name
                    .zip(ty)
                    .filter(|(_, ty)| !ty.is_empty() && !is_receiver_name(ty.as_str()));
                if let Some((name, ty)) = usable {
                    annotated.insert(name, ty);
                }
            }
            "typed_default_parameter" => {
                // This node kind is read through its `name` and `type` fields.
                let (Some(name_node), Some(type_node)) = (
                    entry.child_by_field_name("name"),
                    entry.child_by_field_name("type"),
                ) else {
                    continue;
                };
                let name = &source[name_node.byte_range()];
                if is_receiver_name(name) {
                    continue;
                }
                let ty = extract_python_base_type(source, type_node);
                if !ty.is_empty() {
                    annotated.insert(name.to_string(), ty);
                }
            }
            _ => {}
        }
    }
    annotated
}

/// Extract the base type name from a Python type annotation node.
///
/// For `Bar` → `"Bar"`. For `Optional[Bar]` or `List[Bar]` → `"Bar"`.
/// For `module.Class` → `"Class"` (bare name only).
fn extract_python_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
    match node.kind() {
        "identifier" => source[node.start_byte()..node.end_byte()].to_string(),
        // tree-sitter-python wraps annotations in a `type` node
        "type" => {
            let mut cursor = node.walk();
            for child in node.children(&mut cursor) {
                let t = extract_python_base_type(source, child);
                if !t.is_empty() {
                    return t;
                }
            }
            String::new()
        }
        // Generic alias: `Optional[Bar]` — the first identifier child is `Optional`,
        // the subscript child contains `Bar`. We want the subscript content.
        "subscript" => {
            // subscript has value (e.g. Optional) and subscript (e.g. Bar).
            // Return the subscript's base type (the inner type argument).
            if let Some(sub) = node.child_by_field_name("subscript") {
                return extract_python_base_type(source, sub);
            }
            // Fall back: first identifier
            let mut cursor = node.walk();
            for child in node.children(&mut cursor) {
                if child.kind() == "identifier" {
                    return source[child.start_byte()..child.end_byte()].to_string();
                }
            }
            String::new()
        }
        // Attribute node `module.Class` → take last identifier
        "attribute" => {
            if let Some(attr) = node.child_by_field_name("attribute") {
                return source[attr.start_byte()..attr.end_byte()].to_string();
            }
            String::new()
        }
        _ => {
            // Try first identifier child
            let mut cursor = node.walk();
            for child in node.children(&mut cursor) {
                if child.kind() == "identifier" {
                    return source[child.start_byte()..child.end_byte()].to_string();
                }
            }
            String::new()
        }
    }
}

/// Scan a Python function body for `x = ClassName(...)` assignment patterns.
///
/// Returns a map from local variable name to constructor type:
///
/// - `x = Foo()` → `{"x": "Foo"}`.
/// - `x = module.Foo()` → `{"x": "Foo"}` (trailing attribute segment).
///
/// An assignment is recorded only when its left side is a plain `identifier`,
/// its right side is a `call`, and the called name starts with an uppercase
/// character (the usual Python convention for class names). The uppercase
/// check applies to both the bare and the attribute-qualified form, so
/// ordinary method calls such as `x = obj.compute()` do not bind `x` to a
/// bogus type named `compute`.
///
/// The scan covers the whole body, including nested blocks, and is not
/// flow-sensitive: if a name is assigned more than once, whichever matching
/// assignment is visited last by the stack-based walk wins.
fn extract_python_assignment_types(
    source: &str,
    fn_node: tree_sitter::Node<'_>,
) -> HashMap<String, String> {
    let mut bindings: HashMap<String, String> = HashMap::new();
    let Some(body) = fn_node.child_by_field_name("body") else {
        return bindings;
    };

    let mut stack = vec![body];
    while let Some(n) = stack.pop() {
        if n.kind() == "assignment"
            && let (Some(lhs), Some(rhs)) = (
                n.child_by_field_name("left"),
                n.child_by_field_name("right"),
            )
            && lhs.kind() == "identifier"
            && rhs.kind() == "call"
            && let Some(func) = rhs.child_by_field_name("function")
        {
            let var_name = &source[lhs.byte_range()];
            // Node holding the called name: the identifier itself, or the
            // trailing segment of `module.ClassName`.
            let callee_name_node = match func.kind() {
                "identifier" => Some(func),
                "attribute" => func.child_by_field_name("attribute"),
                _ => None,
            };
            // Class names are conventionally uppercase-first; anything else is
            // treated as a function or method call and ignored.
            let constructor_type = callee_name_node
                .map(|name_node| &source[name_node.byte_range()])
                .filter(|name| name.chars().next().is_some_and(char::is_uppercase));
            if let Some(ty) = constructor_type {
                bindings.insert(var_name.to_string(), ty.to_string());
            }
        }

        // Descend into every node so assignments in nested blocks are seen.
        let mut cursor = n.walk();
        stack.extend(n.children(&mut cursor));
    }
    bindings
}

/// Walk a Python function body and annotate attribute-call byte offsets.
///
/// A Python method call is:
/// `(call function: (attribute value: <receiver> attribute: (identifier) @callee))`
///
/// The receiver is the `value` child of `attribute`. This function checks:
/// - `self` → use the enclosing class name (`class_ctx`).
/// - An identifier matching a parameter type in `param_types`.
/// - An identifier matching a constructor assignment in `let_types`.
fn annotate_python_method_calls(
    source: &str,
    fn_node: tree_sitter::Node<'_>,
    class_ctx: Option<&str>,
    param_types: &HashMap<String, String>,
    let_types: &HashMap<String, String>,
    map: &mut HashMap<u32, String>,
) {
    let mut stack = vec![fn_node];
    while let Some(n) = stack.pop() {
        if n.kind() == "call"
            && let Some(func) = n.child_by_field_name("function")
            && func.kind() == "attribute"
            && let (Some(recv_node), Some(attr_node)) = (
                func.child_by_field_name("object"),
                func.child_by_field_name("attribute"),
            )
        {
            // attribute node: object (receiver) + attribute (method name)
            let recv_text = source[recv_node.start_byte()..recv_node.end_byte()].to_string();
            let receiver_type = if recv_text == "self" || recv_text == "cls" {
                class_ctx.map(str::to_owned)
            } else if recv_node.kind() == "identifier" {
                param_types
                    .get(&recv_text)
                    .or_else(|| let_types.get(&recv_text))
                    .cloned()
            } else {
                None
            };

            if let Some(ty) = receiver_type {
                // The `@callee` capture byte offset is the `attribute` child.
                #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
                let attr_byte = attr_node.start_byte() as u32;
                map.insert(attr_byte, ty);
            }
        }
        let mut cursor = n.walk();
        for child in n.children(&mut cursor) {
            stack.push(child);
        }
    }
}

// ── Python class hierarchy (MRO) extraction ───────────────────────────

/// Collect `class → declared parents` entries from a Python parse tree.
///
/// Shape being matched:
/// ```text
/// (class_definition
///   name: (identifier) @child
///   superclasses: (argument_list
///     (identifier) @parent                        ; class Foo(Bar):
///     (attribute attribute: (identifier) @parent) ; class Foo(mod.Bar):
///     (keyword_argument …)                        ; skipped (metaclass=…)
///   )?
///   ...)
/// ```
///
/// Every `class_definition` with a `name` field, at any nesting depth,
/// contributes one entry to `out`. The value lists the parents in declaration
/// order by bare **name** — for a qualified parent only the trailing
/// `attribute` segment is kept, so `mod.Bar` becomes `Bar`. A class without
/// parents still gets an entry with an empty `Vec`, which lets a later MRO
/// walk distinguish "known class, no parents" from "unknown class". If the
/// same class name occurs more than once, the occurrence visited last by the
/// stack-based walk replaces the earlier one.
fn extract_python_class_hierarchy_node(
    source: &str,
    root: tree_sitter::Node<'_>,
    out: &mut HashMap<String, Vec<String>>,
) {
    let mut pending = vec![root];
    while let Some(current) = pending.pop() {
        if current.kind() == "class_definition"
            && let Some(name_node) = current.child_by_field_name("name")
        {
            let mut parents: Vec<String> = Vec::new();
            if let Some(bases) = current.child_by_field_name("superclasses") {
                // Keep bare identifiers and the tail of dotted names; keyword
                // arguments, punctuation and everything else are ignored.
                let mut base_walker = bases.walk();
                parents.extend(bases.children(&mut base_walker).filter_map(|base| {
                    let parent_name = match base.kind() {
                        "identifier" => Some(base),
                        "attribute" => base.child_by_field_name("attribute"),
                        _ => None,
                    }?;
                    Some(source[parent_name.byte_range()].to_string())
                }));
            }
            out.insert(source[name_node.byte_range()].to_string(), parents);
        }

        // Descend everywhere, class bodies included, to reach nested classes.
        let mut walker = current.walk();
        pending.extend(current.children(&mut walker));
    }
}

/// Extract the Python `class → [parents]` map from a single source file by
/// parsing it with tree-sitter-python.
///
/// Returns an empty map when the source fails to parse or contains no
/// `class_definition` nodes. The returned map is the per-file contribution
/// to the global hierarchy used by [`resolve_calls_with_python_mro_pub`]
/// for MRO-aware receiver-type dispatch (Q1, Wave 2).
#[must_use]
pub fn extract_python_class_hierarchy(source: &str) -> HashMap<String, Vec<String>> {
    let mut parser = Parser::new();
    let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
    if parser.set_language(&lang).is_err() {
        return HashMap::new();
    }
    let Some(tree) = parser.parse(source, None) else {
        return HashMap::new();
    };
    let mut out: HashMap<String, Vec<String>> = HashMap::new();
    extract_python_class_hierarchy_node(source, tree.root_node(), &mut out);
    out
}

/// Compute a linearised ancestor list (an approximation of Python's Method
/// Resolution Order) for the class `start`, using a **simplified left-first,
/// pre-order depth-first walk** of the declared `class → [parents]` map.
///
/// Python's real MRO uses C3 linearisation, which is monotonic and respects
/// declaration order across the diamond inheritance shape. For ripvec's
/// reverse-call-graph purpose we want *any plausible ancestor* of the
/// receiver type — including ancestors only reachable via a mixin — so we
/// can resolve `self.method()` calls whose dispatch lands on an ancestor.
///
/// Visit order: each parent is emitted, then that parent's own ancestors,
/// before moving on to the next sibling. For `class Sub(Base, Mixin)` the
/// result is `Base, <Base's ancestors>, Mixin, <Mixin's ancestors>`.
///
/// When no ancestor is shared between branches this is the order C3 produces
/// for the declared classes. On a diamond the walk may surface the shared
/// ancestor earlier than C3 would, but every ancestor C3 would visit is still
/// reached — the goal is "find an implementing def for an inherited call",
/// not "compute the runtime dispatch winner".
///
/// # Returns
///
/// The ancestors of `start` in visit order. `start` itself is never part of
/// the result, even when the recorded hierarchy is cyclic (e.g. two files
/// declaring `A(B)` and `B(A)`, or a self-referential entry). Each class
/// appears at most once even when reachable through several parent chains.
/// The list is empty when `start` has no entry in `hierarchy`.
fn compute_python_mro<H: std::hash::BuildHasher>(
    start: &str,
    hierarchy: &HashMap<String, Vec<String>, H>,
) -> Vec<String> {
    use std::collections::HashSet;

    // Unknown class: nothing to walk.
    let Some(start_parents) = hierarchy.get(start) else {
        return Vec::new();
    };

    // Seed `visited` with the start class so a cycle leading back to it can
    // never re-emit it as its own ancestor (its scope has already been
    // searched by the direct receiver match).
    let mut visited: HashSet<&str> = HashSet::new();
    visited.insert(start);

    let mut order: Vec<String> = Vec::new();
    // DFS stack: parents are pushed in reverse so `pop` yields them left-first.
    let mut stack: Vec<&str> = start_parents.iter().rev().map(String::as_str).collect();
    while let Some(cls) = stack.pop() {
        // `insert` returns false when the class was already emitted.
        if !visited.insert(cls) {
            continue;
        }
        order.push(cls.to_string());
        if let Some(parents) = hierarchy.get(cls) {
            stack.extend(
                parents
                    .iter()
                    .rev()
                    .map(String::as_str)
                    .filter(|p| !visited.contains(*p)),
            );
        }
    }
    order
}

// ── Go receiver-type heuristic ────────────────────────────────────────

/// Traverse a Go parse tree and record, for method calls made on the
/// enclosing method's receiver, which type that receiver has.
///
/// The single heuristic: **`recv.Method()` inside a `method_declaration`**.
/// A Go method names its receiver in the signature —
/// `func (r *Foo) Bar() { r.Baz() }` binds `r` to `Foo` — so any
/// `call_expression` whose `function` is a `selector_expression` with an
/// `operand` whose text equals the receiver name is attributed to that type.
/// The receiver name is compared textually, so a receiver spelled `self`
/// (unidiomatic Go, but it occurs) works the same way.
///
/// `map` is keyed by the start byte of the selector's `field` child, which is
/// the node the Go call query captures as `@callee`:
/// `(call_expression function: (selector_expression field: (field_identifier) @callee))`.
fn collect_go_receiver_types(
    source: &str,
    root: tree_sitter::Node<'_>,
    map: &mut HashMap<u32, String>,
) {
    // Each work item pairs a node with the `(receiver_name, receiver_type)`
    // binding that is in effect at that node, if any.
    let mut work: Vec<(tree_sitter::Node<'_>, Option<(String, String)>)> = vec![(root, None)];

    while let Some((node, inherited)) = work.pop() {
        let binding_for_children = if node.kind() == "method_declaration" {
            // A method introduces a new binding; when its receiver cannot be
            // extracted, the binding already in effect stays active.
            extract_go_receiver_binding(source, node).or(inherited)
        } else {
            if node.kind() == "call_expression"
                && let Some(callee) = node.child_by_field_name("function")
                && callee.kind() == "selector_expression"
                && let (Some(operand), Some(field)) = (
                    callee.child_by_field_name("operand"),
                    callee.child_by_field_name("field"),
                )
            {
                let operand_text = &source[operand.start_byte()..operand.end_byte()];
                if let Some((recv_name, recv_ty)) = inherited.as_ref()
                    && operand_text == recv_name.as_str()
                {
                    // Key on the `field` child: that is the `@callee` capture.
                    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
                    let callee_offset = field.start_byte() as u32;
                    map.insert(callee_offset, recv_ty.clone());
                }
            }
            inherited
        };

        let mut walker = node.walk();
        for child in node.children(&mut walker) {
            work.push((child, binding_for_children.clone()));
        }
    }
}

/// Read the receiver's name and base type off a Go `method_declaration`.
///
/// Expected node shape (tree-sitter-go):
/// ```text
/// (method_declaration
///   receiver: (parameter_list
///     (parameter_declaration
///       name: (identifier)       ← receiver name
///       type: (type_identifier   ← receiver type (bare)
///            | pointer_type (type_identifier)) ← or *Type
///     )
///   )
///   name: (field_identifier)
///   ...
/// )
/// ```
///
/// Yields `Some((recv_name, type_name))`. Yields `None` when there is no
/// `receiver` field, when the receiver name is the blank identifier `_` (or
/// empty), or when no `parameter_declaration` provides both a name and a
/// resolvable base type.
fn extract_go_receiver_binding(
    source: &str,
    method_node: tree_sitter::Node<'_>,
) -> Option<(String, String)> {
    let receivers = method_node.child_by_field_name("receiver")?;

    // The receiver `parameter_list` normally holds one `parameter_declaration`;
    // punctuation children are filtered out.
    let mut walker = receivers.walk();
    let declarations = receivers
        .children(&mut walker)
        .filter(|child| child.kind() == "parameter_declaration");

    for decl in declarations {
        let (Some(name_node), Some(type_node)) = (
            decl.child_by_field_name("name"),
            decl.child_by_field_name("type"),
        ) else {
            continue;
        };

        let recv_name = &source[name_node.start_byte()..name_node.end_byte()];
        // A blank receiver can never be the operand of a call.
        if recv_name.is_empty() || recv_name == "_" {
            return None;
        }

        let base_type = extract_go_base_type(source, type_node);
        if base_type.is_empty() {
            continue;
        }
        return Some((recv_name.to_string(), base_type));
    }
    None
}

/// Extract the base type name from a Go type node.
///
/// For `Foo` (type_identifier) → `"Foo"`.
/// For `*Foo` (pointer_type → type_identifier) → `"Foo"`.
fn extract_go_base_type(source: &str, node: tree_sitter::Node<'_>) -> String {
    match node.kind() {
        "type_identifier" => source[node.start_byte()..node.end_byte()].to_string(),
        "pointer_type" => {
            // pointer_type has one child: the pointee type
            let mut cursor = node.walk();
            for child in node.children(&mut cursor) {
                if child.kind() == "type_identifier" {
                    return source[child.start_byte()..child.end_byte()].to_string();
                }
                let t = extract_go_base_type(source, child);
                if !t.is_empty() {
                    return t;
                }
            }
            String::new()
        }
        _ => {
            let mut cursor = node.walk();
            for child in node.children(&mut cursor) {
                if child.kind() == "type_identifier" {
                    return source[child.start_byte()..child.end_byte()].to_string();
                }
            }
            String::new()
        }
    }
}

/// Stamp each Go `method_declaration` definition with its receiver type.
///
/// Background: the generic `extract_definitions` path derives a def's scope
/// from the *parent* chain of the `@def` node. A Go `method_declaration` sits
/// directly under the file root, so its scope comes out as `""`. With an
/// empty scope, the receiver-type step of `resolve_calls`
/// (`scope.contains(recv_type)`) can never match a Go method: calls inferred
/// with `receiver_type = Some("Foo")` stay unresolved, no edge is recorded,
/// and `def_callers[]` stays empty for those defs — the root cause of the
/// missing inverse index for Go (I#P1).
///
/// Remedy: re-parse the Go source, and for every top-level
/// `method_declaration` with an extractable receiver, overwrite the scope of
/// the matching def with `"method_declaration {ReceiverType}"`. That is the
/// shape the `go_resolve_receiver_method_via_signature` integration test
/// relies on (`scope.contains("Foo")` on `"method_declaration Foo"`).
///
/// Defs are matched on `start_byte`, not on name, so same-named methods on
/// different receiver types are kept apart. The function returns without
/// touching `defs` when the grammar cannot be set or the parse yields no tree.
fn enrich_go_method_def_scopes(source: &str, defs: &mut [Definition]) {
    let mut parser = Parser::new();
    let language: tree_sitter::Language = tree_sitter_go::LANGUAGE.into();
    if parser.set_language(&language).is_err() {
        return;
    }
    let Some(tree) = parser.parse(source, None) else {
        return;
    };

    // Only direct children of the root are inspected.
    let root = tree.root_node();
    let mut walker = root.walk();
    let methods = root
        .children(&mut walker)
        .filter(|node| node.kind() == "method_declaration");

    for method in methods {
        let Some((_, receiver)) = extract_go_receiver_binding(source, method) else {
            continue;
        };
        // The Go definition query captures the `method_declaration` node itself
        // as `@def`, so the node's start byte equals the def's recorded start.
        #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
        let offset = method.start_byte() as u32;
        let target = defs
            .iter_mut()
            .find(|def| def.kind == "method_declaration" && def.start_byte == offset);
        if let Some(def) = target {
            def.scope = format!("method_declaration {receiver}");
        }
    }
}

/// Public wrapper for `enrich_go_method_def_scopes` — enables integration tests
/// to call it directly without going through the full `build_graph` pipeline.
///
/// Forwards both arguments unchanged:
/// - `source`: the Go source text that `defs` were extracted from.
/// - `defs`: definitions to update in place; matching `method_declaration`
///   entries get their `scope` rewritten to `"method_declaration {ReceiverType}"`.
pub fn enrich_go_method_def_scopes_pub(source: &str, defs: &mut [Definition]) {
    // Pure delegation; no additional logic lives here.
    enrich_go_method_def_scopes(source, defs);
}

/// SQL: insert, at the front of `defs`, a synthetic definition that spans the
/// whole file and is named after the filename stem.
///
/// dbt and sqlmesh use the filename as the model name:
///
/// - `silver_issuer_returns.sql` defines the `silver_issuer_returns` model.
/// - `gold_issuer_returns.sql` refers to the silver model by that stem, not
///   by any in-source CREATE TABLE.
///
/// In sqlmesh the in-source name is templated:
///
/// ```sql
/// MODEL (
///   name @{athena_sqlmesh_silver_schema}.issuer_returns,
///   ...
/// );
/// SELECT ... FROM @{athena_sqlmesh_silver_schema}.stg_issuer_returns;
/// ```
///
/// The `MODEL (...)` header parses as an ERROR node under tree-sitter-sequel
/// (`@{var}` is not standard SQL) while FROM/JOIN further down still extract
/// cleanly. Without the synthetic def a symbol lookup for
/// `silver_issuer_returns` finds nothing, because the file contains no real
/// CREATE TABLE and the model name exists only as interpolation.
///
/// The inserted definition has:
/// - `name` = filename stem (e.g., `silver_issuer_returns`)
/// - `kind` = `"sql_file"` (see [`languages::lsp_symbol_kind_for_node_kind`]
///   for how node kinds are mapped to LSP symbol kinds)
/// - lines `1 ..= newline_count + 1`, bytes `[0, source.len())`
/// - empty `scope`, `signature = None`, no calls
///
/// Covering the whole file is what makes FROM/JOIN attribution work: a call
/// reference that sits outside every smaller def is still enclosed by this
/// one, so the smallest-enclosing-def search in `extract_calls` lands here.
///
/// No-op when the filename yields no stem (empty / `..`). Idempotent: a
/// `sql_file` def already starting at byte 0 is left untouched.
pub(crate) fn enrich_sql_file_def(filename: &str, source: &str, defs: &mut Vec<Definition>) {
    // Idempotency guard.
    let already_present = defs
        .iter()
        .any(|def| def.start_byte == 0 && def.kind == "sql_file");
    if already_present {
        return;
    }

    // Last path component with its extension removed; bail out when absent,
    // non-UTF-8, or empty.
    let Some(model_name) = std::path::Path::new(filename)
        .file_stem()
        .and_then(std::ffi::OsStr::to_str)
        .filter(|stem| !stem.is_empty())
    else {
        return;
    };

    // One-based number of the last line: newline count plus one.
    let newline_count = source.matches('\n').count();
    #[expect(clippy::cast_possible_truncation, reason = "line counts fit in u32")]
    let end_line = (newline_count as u32) + 1;
    #[expect(clippy::cast_possible_truncation, reason = "byte offsets fit in u32")]
    let end_byte = source.len() as u32;

    // Index 0 keeps the file-wide def first; narrower defs (CTEs, …) still win
    // the smallest-enclosing rule in `extract_calls` for the ranges they cover.
    defs.insert(
        0,
        Definition {
            name: model_name.to_string(),
            kind: "sql_file".to_string(),
            start_line: 1,
            end_line,
            scope: String::new(),
            signature: None,
            start_byte: 0,
            end_byte,
            calls: vec![],
        },
    );
}

/// Public wrapper for [`enrich_sql_file_def`] — enables integration tests
/// to call it directly without going through the full `build_graph` pipeline.
///
/// This is `pub` (not `pub(crate)`) because integration tests in
/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
/// `pub(crate)` items.
///
/// Arguments are forwarded unchanged:
/// - `filename`: path or file name whose stem becomes the synthetic def's name.
/// - `source`: the SQL text; its length and newline count set the def's range.
/// - `defs`: receives the synthetic `sql_file` def at index 0 (unless one is
///   already present at byte 0 or the filename has no stem).
pub fn enrich_sql_file_def_pub(filename: &str, source: &str, defs: &mut Vec<Definition>) {
    // Pure delegation; no additional logic lives here.
    enrich_sql_file_def(filename, source, defs);
}

/// Public wrapper for `extract_calls` — enables integration tests to call it
/// directly without going through the full `build_graph` pipeline.
///
/// This is `pub` (not `pub(crate)`) because integration tests in
/// `crates/ripvec-core/tests/` are in a separate crate and cannot access
/// `pub(crate)` items.
///
/// All three arguments are forwarded unchanged to `extract_calls`. `defs` is
/// passed mutably — presumably so the call references found in `source` can
/// be attached to their enclosing definitions; confirm against
/// `extract_calls`, which is not part of this section.
pub fn extract_calls_pub(
    source: &str,
    call_config: &languages::CallConfig,
    defs: &mut [Definition],
) {
    // Pure delegation; no additional logic lives here.
    extract_calls(source, call_config, defs);
}

/// Build an index from definition name to list of `DefId`s.
///
/// Public entry point that delegates to the private `build_def_index`.
/// Every definition of every file in `files` contributes one
/// `(file_idx, def_idx)` entry under its `name`, so a name defined in several
/// places maps to several ids.
#[must_use]
pub fn build_def_index_pub(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
    // Pure delegation; no additional logic lives here.
    build_def_index(files)
}

/// Group every definition in `files` by its name.
///
/// The value for a name lists the `(file_idx, def_idx)` ids of all
/// definitions carrying that name, in file order and then definition order.
fn build_def_index(files: &[FileNode]) -> HashMap<String, Vec<DefId>> {
    // Flatten to `(file position, def position, def)` triples first.
    let located = files.iter().enumerate().flat_map(|(file_pos, file)| {
        file.defs
            .iter()
            .enumerate()
            .map(move |(def_pos, def)| (file_pos, def_pos, def))
    });

    let mut by_name: HashMap<String, Vec<DefId>> = HashMap::new();
    for (file_pos, def_pos, def) in located {
        #[expect(
            clippy::cast_possible_truncation,
            reason = "file and def indices fit in u32/u16"
        )]
        let id: DefId = (file_pos as u32, def_pos as u16);
        by_name.entry(def.name.clone()).or_default().push(id);
    }
    by_name
}

/// Resolve call references to target definitions.
///
/// Runs the shared resolution cascade with an empty Python class hierarchy,
/// so the MRO walk (step 2b below) never finds an ancestor. Each call's
/// `resolved` field is set to the target `DefId`, or left untouched when the
/// call is ambiguous or unknown.
///
/// Resolution steps, in the order they are described (see the notes on
/// ordering at the end):
///
/// 0. **HCL module references** (`qualified_path` starting with
///    `terraform_remote_state.` or `module.`): resolve to the first def of the
///    first file whose path contains `/<call name>/` or starts with
///    `<call name>/`. No other step runs for such calls.
/// 1. **Qualified path** (`qualified_path = Some("mod_a::foo")`): look up
///    candidates by the bare call name and keep those whose *file path*
///    contains the last segment of the qualifier. Unique match → resolve;
///    ambiguous or no match → leave `None`.
/// 2. **Receiver type — direct scope match** (`receiver_type = Some("Foo")`):
///    for method calls, prefer candidates whose `scope` contains the receiver
///    type name (e.g., `"impl_item Foo"`). Among receiver-matching candidates,
///    further prefer those in imported files. Unique match → resolve;
///    ambiguous → leave `None`.
///    When this step finds nothing on the receiver class itself, sub-step 2b
///    (Python MRO walk) runs: when `Foo` has a recorded parent chain, walk
///    the receiver class's MRO (left-first DFS) and try the scope-match
///    against each ancestor's name. First ancestor with a matching candidate
///    wins. See `compute_python_mro` for the simplification rationale (it
///    diverges from C3 only on diamond shapes where two ancestors define the
///    same name). If no ancestor matches either, resolution continues with
///    step 3.
/// 3. **Same file** (unqualified, or receiver step found nothing): prefer a
///    definition in the caller's own file (the first such candidate).
/// 4. **SQL suffix-match** (sql_file callers only, no exact-name match): when
///    the caller def has `kind = "sql_file"` and the bare `call_name` (e.g.,
///    `"issuer_returns"`) has no exact entry in the def index, scan all
///    `sql_file` defs for names ending with `_<call_name>` (e.g.,
///    `"silver_issuer_returns"`). This bridges dbt / sqlmesh layered schema
///    prefixes: `gold_issuer_returns.sql` uses `FROM @{schema}.issuer_returns`
///    which tree-sitter reduces to the bare name `"issuer_returns"`, while the
///    target def is the synthetic `sql_file` def named `"silver_issuer_returns"`.
///    Unique suffix-match → resolve; ambiguous (multiple layers match) or
///    no match → leave `None`. Non-sql_file callers are explicitly excluded.
/// 5. **Imported file** (unqualified): check definitions in files this
///    file imports. Unique imported candidate → resolve.
/// 6. **Ambiguous or unresolved**: leave `resolved` as `None` (no silent first-wins).
///
/// Ordering note: step 4 only applies when the def index has *no* entry for
/// the call name, whereas steps 2, 3 and 5 require one, so the relative
/// position of step 4 does not affect the outcome.
///
/// Equivalent to [`resolve_calls_with_python_mro_pub`] with an empty class
/// hierarchy (the MRO walk is a no-op).
pub fn resolve_calls_pub<S: std::hash::BuildHasher>(
    files: &mut [FileNode],
    def_index: &HashMap<String, Vec<DefId>, S>,
) {
    // An empty hierarchy makes every MRO lookup return no ancestors.
    let empty: HashMap<String, Vec<String>> = HashMap::new();
    resolve_calls_inner(files, def_index, &empty);
}

/// Resolve call references with MRO-aware Python receiver dispatch enabled.
///
/// Identical to [`resolve_calls_pub`] except that Priority 2.5 (the MRO walk)
/// fires when the caller passes a non-empty `python_class_hierarchy`.
/// `build_graph` populates the hierarchy by parsing every Python source file
/// with [`extract_python_class_hierarchy`] and merging the per-file maps.
///
/// Tests that want to exercise MRO resolution without going through
/// `build_graph` can call this directly with a synthetic hierarchy.
///
/// Parameters:
/// - `files`: file nodes whose call references are resolved in place.
/// - `def_index`: definition name → candidate `DefId`s.
/// - `python_class_hierarchy`: class name → declared parent class names.
///
/// Both maps are generic over their hasher, so callers may pass maps built
/// with a non-default `BuildHasher`.
pub fn resolve_calls_with_python_mro_pub<S, H>(
    files: &mut [FileNode],
    def_index: &HashMap<String, Vec<DefId>, S>,
    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
) where
    S: std::hash::BuildHasher,
    H: std::hash::BuildHasher,
{
    // Pure delegation; no additional logic lives here.
    resolve_calls_inner(files, def_index, python_class_hierarchy);
}

/// Crate-private entry point for call resolution.
///
/// Takes the same arguments as [`resolve_calls_with_python_mro_pub`] and, like
/// it, forwards them unchanged to `resolve_calls_inner`, which updates the
/// `resolved` field of the call references stored in `files`.
fn resolve_calls<S, H>(
    files: &mut [FileNode],
    def_index: &HashMap<String, Vec<DefId>, S>,
    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
) where
    S: std::hash::BuildHasher,
    H: std::hash::BuildHasher,
{
    // Pure delegation; no additional logic lives here.
    resolve_calls_inner(files, def_index, python_class_hierarchy);
}

/// Shared implementation of call resolution.
///
/// Visits every call reference of every definition in `files` and, where a
/// target can be determined, stores its `DefId` in the call's `resolved`
/// field. Calls that cannot be resolved are left as they were.
///
/// Per call, the steps are evaluated in this order; the first one that
/// applies decides the outcome:
///
/// 1. HCL `terraform_remote_state.` / `module.` qualified paths → first def
///    of the first file whose path matches the module directory.
/// 2. Any other qualified path → unique candidate whose file path contains
///    the last qualifier segment.
/// 3. SQL suffix-match, only when the name has no entry in `def_index` and
///    the calling def is a `sql_file`.
/// 4. No entry in `def_index` → unresolved.
/// 5. Receiver type → scope match, then import preference, then the Python
///    MRO walk over `python_class_hierarchy`.
/// 6. Same file → first candidate located in the caller's file.
/// 7. Imported file → unique candidate located in an imported file.
///
/// The loops are index-based (and the call's fields are cloned up front)
/// because the body needs to read other entries of `files` while writing to
/// the current call.
#[expect(
    clippy::too_many_lines,
    reason = "7-priority resolution cascade (qualified path, receiver type, MRO walk, same-file, \
              SQL suffix-match, imported-file, ambiguous); each priority is a distinct decision \
              branch and extracting helpers would require passing large shared state across boundaries"
)]
fn resolve_calls_inner<S, H>(
    files: &mut [FileNode],
    def_index: &HashMap<String, Vec<DefId>, S>,
    python_class_hierarchy: &HashMap<String, Vec<String>, H>,
) where
    S: std::hash::BuildHasher,
    H: std::hash::BuildHasher,
{
    // Pre-compute imported file sets for each file (indices of the files each
    // file's resolved imports point at; unresolved imports are skipped).
    let imported_files: Vec<std::collections::HashSet<u32>> = files
        .iter()
        .map(|f| {
            f.imports
                .iter()
                .filter_map(|imp| imp.resolved_idx)
                .collect()
        })
        .collect();

    for file_idx in 0..files.len() {
        for def_idx in 0..files[file_idx].defs.len() {
            for call_idx in 0..files[file_idx].defs[def_idx].calls.len() {
                // Clone the call's fields so no borrow of `files` is held
                // while other files are inspected and the call is updated.
                let call_name = files[file_idx].defs[def_idx].calls[call_idx].name.clone();
                let qualified_path = files[file_idx].defs[def_idx].calls[call_idx]
                    .qualified_path
                    .clone();
                let receiver_type = files[file_idx].defs[def_idx].calls[call_idx]
                    .receiver_type
                    .clone();

                // HCL: dedicated resolution for `terraform_remote_state.<NAME>`
                // and `module.<NAME>` qualified paths. Aurora's module DAG is
                // expressed by these patterns; resolve to the first def in any
                // file under a `/<NAME>/` directory segment. This is the
                // module-source contract (R2 + R3, Wave 3).
                if let Some(ref qpath) = qualified_path
                    && (qpath.starts_with("terraform_remote_state.")
                        || qpath.starts_with("module."))
                {
                    let target = &call_name; // already the bare module label
                    let segment_match = format!("/{target}/");
                    let alt_segment_prefix = format!("{target}/"); // when path starts with target dir
                    let candidate = files.iter().enumerate().find_map(|(idx, f)| {
                        if f.path.contains(&segment_match)
                            || f.path.starts_with(&alt_segment_prefix)
                        {
                            // Pick the first def in the file (or skip if file
                            // has no defs).
                            if !f.defs.is_empty() {
                                #[expect(
                                    clippy::cast_possible_truncation,
                                    reason = "file index fits in u32"
                                )]
                                {
                                    return Some((idx as u32, 0u16));
                                }
                            }
                        }
                        None
                    });
                    if let Some(did) = candidate {
                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
                    }
                    // HCL references never fall through to the generic steps.
                    continue;
                }

                // ── Priority 1: Qualified-path resolution ────────────────
                //
                // `qualified_path` carries the full scoped path (e.g. "mod_a::foo").
                // We look up candidates by the bare `call_name`, then keep the
                // candidates whose file path contains the last segment of the
                // qualifier. Only the file path is consulted here, not the scope.
                if let Some(ref qpath) = qualified_path {
                    // Qualifier is everything before the final `::`.
                    let qualifier = if let Some(pos) = qpath.rfind("::") {
                        &qpath[..pos]
                    } else {
                        qpath.as_str()
                    };
                    // Only the last of these segments is used by the filter below.
                    let qual_segments: Vec<&str> = qualifier.split("::").collect();

                    let Some(candidates) = def_index.get(&call_name) else {
                        continue;
                    };

                    let matching: Vec<DefId> = candidates
                        .iter()
                        .copied()
                        .filter(|&(f_idx, _)| {
                            let file_path = &files[f_idx as usize].path;
                            let last_segment = qual_segments.last().copied().unwrap_or("");
                            // Module-style spelling of the path: `.rs` suffix
                            // dropped and path separators turned into `::`.
                            let path_as_module =
                                file_path.trim_end_matches(".rs").replace(['/', '\\'], "::");
                            path_as_module.contains(last_segment)
                                || file_path.contains(last_segment)
                        })
                        .collect();

                    if matching.len() == 1 {
                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(matching[0]);
                    }
                    // Ambiguous or no match → leave None.
                    continue;
                }

                // ── Priority 3.5: SQL suffix-match resolution ────────────
                //
                // dbt / sqlmesh pipelines use layered schema prefixes:
                // `silver_issuer_returns.sql` defines the silver-layer model.
                // `gold_issuer_returns.sql` references it via a FROM clause
                // that tree-sitter parses as `name: "issuer_returns"` (the
                // `name:` field-selector strips the `@{schema}.` prefix).
                // `def_index.get("issuer_returns")` returns None — the def is
                // stored under "silver_issuer_returns".
                //
                // When: (a) no exact-name candidate exists, AND (b) the
                // caller's enclosing def is a sql_file (whole-file synthetic
                // def emitted by `enrich_sql_file_def`), walk every sql_file
                // def in the graph and check whether its name ends with
                // `_<call_name>`. Unique suffix-match → resolve. Ambiguous
                // (e.g., both gold_ and silver_ match the same bare name) →
                // leave None (no silent first-wins).
                //
                // Non-sql_file callers are explicitly excluded: a Rust
                // function_item whose call_name happens to end with a suffix
                // of some sql_file def must NOT be resolved via this path.
                //
                // Placement: this block sits ahead of Priorities 2–4 because
                // it handles exactly the case the candidate lookup just below
                // bails out on (no `def_index` entry for the name).
                if !def_index.contains_key(&call_name)
                    && files[file_idx].defs[def_idx].kind == "sql_file"
                    && !call_name.is_empty()
                {
                    let suffix = format!("_{call_name}");
                    let suffix_str = suffix.as_str();
                    // Exclude the caller def itself from the suffix scan:
                    // `gold_issuer_returns` also ends with `_issuer_returns`
                    // but must not self-resolve.
                    #[expect(clippy::cast_possible_truncation, reason = "file index fits in u32")]
                    let caller_did: DefId = (file_idx as u32, def_idx as u16);
                    let suffix_matches: Vec<DefId> = files
                        .iter()
                        .enumerate()
                        .flat_map(|(f_idx, f)| {
                            f.defs.iter().enumerate().filter_map(move |(d_idx, d)| {
                                #[expect(
                                    clippy::cast_possible_truncation,
                                    reason = "file and def indices fit in u32/u16"
                                )]
                                let did: DefId = (f_idx as u32, d_idx as u16);
                                if d.kind == "sql_file"
                                    && d.name.ends_with(suffix_str)
                                    && did != caller_did
                                {
                                    Some(did)
                                } else {
                                    None
                                }
                            })
                        })
                        .collect();
                    if suffix_matches.len() == 1 {
                        files[file_idx].defs[def_idx].calls[call_idx].resolved =
                            Some(suffix_matches[0]);
                    }
                    // Ambiguous (>1) or no match (0) → leave None.
                    continue;
                }

                // Every remaining step needs at least one same-name candidate.
                let Some(candidates) = def_index.get(&call_name) else {
                    continue;
                };

                // ── Priority 2: Receiver-type resolution ─────────────────
                //
                // `receiver_type = Some("Foo")` means this is a method call on a
                // value whose type is `Foo`. Filter candidates to those whose scope
                // chain contains the receiver type name.
                if let Some(ref rtype) = receiver_type {
                    // Candidates whose scope contains the receiver type name
                    // (a substring test on the scope string).
                    let receiver_matching: Vec<DefId> = candidates
                        .iter()
                        .copied()
                        .filter(|&(f_idx, d_idx)| {
                            let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
                            scope.contains(rtype.as_str())
                        })
                        .collect();

                    if receiver_matching.len() == 1 {
                        files[file_idx].defs[def_idx].calls[call_idx].resolved =
                            Some(receiver_matching[0]);
                        continue;
                    }

                    if receiver_matching.len() > 1 {
                        // Among receiver-matching candidates, prefer those in imported files.
                        let imported_receiver_matching: Vec<DefId> = receiver_matching
                            .iter()
                            .copied()
                            .filter(|(f, _)| imported_files[file_idx].contains(f))
                            .collect();
                        if imported_receiver_matching.len() == 1 {
                            files[file_idx].defs[def_idx].calls[call_idx].resolved =
                                Some(imported_receiver_matching[0]);
                        }
                        // Ambiguous even after import filter → leave None.
                        continue;
                    }

                    // ── Priority 2.5: Python MRO walk ─────────────────────
                    //
                    // The receiver type's own scope has no matching def — but
                    // the method may live on an ancestor class. Walk the MRO
                    // (left-first DFS) and try the scope-match against each
                    // ancestor's name. First ancestor with at least one
                    // scope-matching candidate wins; if multiple candidates
                    // match for the same ancestor, prefer imported files,
                    // else take the first in stable order.
                    //
                    // Liskov: a subclass's `self.method()` call must dispatch
                    // through the MRO; over-approximating ancestors is the
                    // correct conservative move for a reverse call graph.
                    // For non-Python languages or Python receivers with no
                    // recorded parents, `compute_python_mro` returns an
                    // empty vector and this loop is a no-op.
                    let mro = compute_python_mro(rtype, python_class_hierarchy);
                    let mut resolved_via_mro: Option<DefId> = None;
                    for ancestor in &mro {
                        let ancestor_matching: Vec<DefId> = candidates
                            .iter()
                            .copied()
                            .filter(|&(f_idx, d_idx)| {
                                let scope = &files[f_idx as usize].defs[d_idx as usize].scope;
                                scope.contains(ancestor.as_str())
                            })
                            .collect();
                        if ancestor_matching.len() == 1 {
                            resolved_via_mro = Some(ancestor_matching[0]);
                            break;
                        }
                        if ancestor_matching.len() > 1 {
                            // Prefer imported files among the ancestor matches.
                            let imported_ancestor: Vec<DefId> = ancestor_matching
                                .iter()
                                .copied()
                                .filter(|(f, _)| imported_files[file_idx].contains(f))
                                .collect();
                            if imported_ancestor.len() == 1 {
                                resolved_via_mro = Some(imported_ancestor[0]);
                                break;
                            }
                            // Ambiguous at this ancestor — pick the first
                            // candidate in stable order. The MRO walk's job
                            // is to find *an* implementing def for an
                            // inherited call, not to compute the runtime
                            // dispatch winner; any plausible candidate is
                            // useful for the reverse call graph.
                            resolved_via_mro = Some(ancestor_matching[0]);
                            break;
                        }
                    }
                    if let Some(did) = resolved_via_mro {
                        files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
                        continue;
                    }
                    // No receiver-matching candidates anywhere in the MRO →
                    // fall through to bare-name resolution.
                }

                // ── Priority 3: Same-file resolution ─────────────────────
                //
                // Takes the first candidate located in the caller's own file;
                // unlike the other steps there is no uniqueness requirement.
                #[expect(clippy::cast_possible_truncation)]
                let file_idx_u32 = file_idx as u32;
                if let Some(&did) = candidates.iter().find(|(f, _)| *f == file_idx_u32) {
                    files[file_idx].defs[def_idx].calls[call_idx].resolved = Some(did);
                    continue;
                }

                // ── Priority 4: Imported-file resolution ──────────────────
                //
                // Resolves only when exactly one candidate lives in a file
                // that the caller's file imports.
                let imported_candidates: Vec<DefId> = candidates
                    .iter()
                    .copied()
                    .filter(|(f, _)| imported_files[file_idx].contains(f))
                    .collect();
                if imported_candidates.len() == 1 {
                    files[file_idx].defs[def_idx].calls[call_idx].resolved =
                        Some(imported_candidates[0]);
                }
                // Priority 5: Ambiguous or unresolved → leave None.
            }
        }
    }
}

/// Build the cumulative definition-count table used to map `DefId`s onto a
/// single flat index space.
///
/// Entry `i` holds the number of definitions in all files before file `i`.
/// The table has `files.len() + 1` entries; the final one is the total
/// number of definitions across every file.
fn def_offsets(files: &[FileNode]) -> Vec<usize> {
    let mut running = 0_usize;
    std::iter::once(0)
        .chain(files.iter().map(|file| {
            running += file.defs.len();
            running
        }))
        .collect()
}

/// Map a `(file index, def index)` pair to its position in the flattened
/// definition arrays.
///
/// `offsets` is the prefix-sum table produced by `def_offsets`. Panics if the
/// file index is outside the table.
fn flatten_def_id(offsets: &[usize], did: DefId) -> usize {
    let (file_idx, def_idx) = did;
    let file_start = offsets[file_idx as usize];
    file_start + usize::from(def_idx)
}

/// Build the strongest caller and callee lists for every definition.
///
/// `edges` are `(src, dst, weight)` triples over *flattened* definition
/// indices in `0..n`; triples that reference an index outside that range are
/// ignored. `offsets` is the prefix-sum table from `def_offsets`, used to turn
/// flat indices back into `DefId`s.
///
/// Returns `(callers, callees)`, both indexed by flattened definition index.
/// Each list is ordered by descending edge weight (ties keep the order in
/// which the edges were supplied) and capped at `MAX_NEIGHBORS` entries.
fn build_def_neighbor_lists(
    n: usize,
    edges: &[(u32, u32, u32)],
    offsets: &[usize],
) -> (Vec<Vec<DefId>>, Vec<Vec<DefId>>) {
    let mut inbound: Vec<Vec<(u32, u32)>> = vec![Vec::new(); n];
    let mut outbound: Vec<Vec<(u32, u32)>> = vec![Vec::new(); n];

    for &(src, dst, weight) in edges {
        let (from, to) = (src as usize, dst as usize);
        if from >= n || to >= n {
            continue;
        }
        inbound[to].push((src, weight));
        outbound[from].push((dst, weight));
    }

    // Inverse of `flatten_def_id`: the owning file is the last offset entry
    // that is <= the flat index.
    let to_def_id = |flat: u32| -> DefId {
        let flat = flat as usize;
        let file_idx = offsets.partition_point(|&start| start <= flat) - 1;
        let def_idx = flat - offsets[file_idx];
        #[expect(clippy::cast_possible_truncation)]
        (file_idx as u32, def_idx as u16)
    };

    // Shared pipeline for both directions: heaviest first, cap, convert.
    let strongest = |mut weighted: Vec<(u32, u32)>| -> Vec<DefId> {
        weighted.sort_by_key(|entry| std::cmp::Reverse(entry.1));
        weighted.truncate(MAX_NEIGHBORS);
        weighted
            .into_iter()
            .map(|(flat, _)| to_def_id(flat))
            .collect()
    };

    let callers = inbound.into_iter().map(&strongest).collect();
    let callees = outbound.into_iter().map(&strongest).collect();

    (callers, callees)
}

// ── PageRank ─────────────────────────────────────────────────────────

/// Compute `PageRank` scores for a graph.
///
/// `edges` are `(src, dst, weight)` triples over node indices `0..n`. Edges
/// that reference a node outside that range are ignored. Edges with weight
/// zero are ignored as well: they carry no rank, and keeping them would make
/// a node whose outgoing weights are all zero divide by a zero total and
/// turn every score into `NaN`.
///
/// If `focus` is `Some(idx)`, computes topic-sensitive `PageRank` biased
/// toward file `idx`. Otherwise computes standard (uniform) `PageRank`.
///
/// Returns one score per node, summing to 1.0 (up to floating-point
/// rounding). Returns an empty vector when `n == 0`.
#[expect(
    clippy::cast_precision_loss,
    reason = "node count fits comfortably in f32"
)]
fn pagerank(n: usize, edges: &[(u32, u32, u32)], focus: Option<usize>) -> Vec<f32> {
    if n == 0 {
        return vec![];
    }

    // Build adjacency: out_edges[src] = [(dst, weight)]
    let mut out_edges: Vec<Vec<(usize, f32)>> = vec![vec![]; n];
    let mut out_weight: Vec<f32> = vec![0.0; n];

    for &(src, dst, w) in edges {
        // A zero-weight edge transfers nothing. Dropping it here means a node
        // with only zero-weight edges is handled as dangling below instead of
        // evaluating 0.0 / 0.0 in the distribution step.
        if w == 0 {
            continue;
        }
        let (s, d) = (src as usize, dst as usize);
        if s < n && d < n {
            #[expect(clippy::cast_possible_truncation, reason = "edge weights are small")]
            let wf = f64::from(w) as f32;
            out_edges[s].push((d, wf));
            out_weight[s] += wf;
        }
    }

    // Personalization vector (Haveliwala 2002, topic-sensitive PageRank).
    //
    // When a focus file is specified, the teleportation distribution is split:
    //   - PERSONALIZATION_ALPHA (0.15) concentrated on the focus node.
    //   - (1 - PERSONALIZATION_ALPHA) = 0.85 spread uniformly over the
    //     remaining (n - 1) other nodes.
    //
    // This gives the focus file a gentle bias over its neighbors without
    // collapsing every other file to an equal uniform floor. The resulting
    // ranks still vary across the corpus, so the caller sees a *neighborhood*
    // of semantically related files rebiased toward the focus (I#16 fix).
    //
    // NOTE(review): if PERSONALIZATION_ALPHA really is 0.15, the focus share
    // is *below* the uniform share 1/n whenever n <= 6, so on very small
    // graphs the focus node receives less teleport mass than its peers.
    // Confirm whether that is intended.
    //
    // For n == 1 there are no other nodes; the focus gets all mass (= 1.0).
    // An out-of-range focus index leaves every entry equal, which the
    // normalization below turns into the uniform distribution.
    let bias: Vec<f32> = if let Some(idx) = focus {
        if n == 1 {
            vec![1.0_f32]
        } else {
            let other_mass = (1.0_f32 - PERSONALIZATION_ALPHA) / (n as f32 - 1.0);
            let mut b = vec![other_mass; n];
            if idx < n {
                b[idx] = PERSONALIZATION_ALPHA;
            }
            // Normalize so the vector sums to 1.0 (it already does by
            // construction for an in-range focus; this also covers the
            // out-of-range case and floating-point drift).
            let sum: f32 = b.iter().sum();
            for v in &mut b {
                *v /= sum;
            }
            b
        }
    } else {
        vec![1.0 / n as f32; n]
    };

    let mut rank = vec![1.0 / n as f32; n];
    let mut next_rank = vec![0.0_f32; n];

    for _ in 0..MAX_ITERATIONS {
        // Collect dangling mass (nodes with no outgoing edges)
        let dangling: f32 = rank
            .iter()
            .enumerate()
            .filter(|&(i, _)| out_edges[i].is_empty())
            .map(|(_, &r)| r)
            .sum();

        // Teleport share plus redistributed dangling mass, both spread
        // according to the bias vector.
        for (i, nr) in next_rank.iter_mut().enumerate() {
            *nr = (1.0 - DAMPING).mul_add(bias[i], DAMPING * dangling * bias[i]);
        }

        // Each non-dangling node passes its damped rank along its out-edges
        // in proportion to edge weight.
        for (src, edges_list) in out_edges.iter().enumerate() {
            if edges_list.is_empty() {
                continue;
            }
            let src_rank = rank[src];
            let total_w = out_weight[src];
            for &(dst, w) in edges_list {
                next_rank[dst] += DAMPING * src_rank * (w / total_w);
            }
        }

        // Check convergence (L1 distance between successive iterates)
        let diff: f32 = rank
            .iter()
            .zip(next_rank.iter())
            .map(|(a, b)| (a - b).abs())
            .sum();

        std::mem::swap(&mut rank, &mut next_rank);

        if diff < EPSILON {
            break;
        }
    }

    rank
}

// ── Graph Building ───────────────────────────────────────────────────

/// Intermediate result from definition-level graph computation.
///
/// Produced by `compute_def_graph`; the graph builders move these fields
/// into the matching `RepoGraph` fields.
struct DefGraphData {
    /// Definition-level edges `(source_def, target_def, weight)`: resolved
    /// call edges plus the trait↔impl edges.
    def_edges: Vec<(DefId, DefId, u32)>,
    /// `PageRank` score per definition, indexed by flattened definition index.
    def_ranks: Vec<f32>,
    /// Top callers of each definition, indexed by flattened definition index.
    def_callers: Vec<Vec<DefId>>,
    /// Top callees of each definition, indexed by flattened definition index.
    def_callees: Vec<Vec<DefId>>,
    /// Prefix-sum table: entry `i` is the flat index of file `i`'s first
    /// definition; one trailing entry holds the total definition count.
    offsets: Vec<usize>,
    /// Per-file rank, computed as the sum of that file's definition ranks.
    base_ranks: Vec<f32>,
    /// File-level edges `(src_file, dst_file, weight)` aggregated from the
    /// definition-level edges that cross a file boundary.
    file_edges: Vec<(u32, u32, u32)>,
}

/// Build bidirectional trait↔impl method edges for `PageRank` propagation (G3).
///
/// Public entry point that forwards directly to the private
/// `build_trait_impl_edges`.
///
/// For every impl method matched to a trait method, adds:
/// - `(impl_def_id, trait_def_id, 1)` — impl → trait
/// - `(trait_def_id, impl_def_id, 1)` — trait → impl
///
/// Detection heuristic: an impl method "overrides" a trait method when:
/// - The impl method's kind is `"function_item"` and its `scope` starts with
///   `"impl_item"`.
/// - The trait method's kind is `"function_signature_item"` (regardless of
///   scope), or its kind is `"function_item"` and its `scope` starts with
///   `"trait_item"` (presumably a trait method with a default body).
/// - Both have the same `name`.
/// - The impl's file imports the trait's file (or they share a file).
///
/// This is heuristic, not sound: it may produce false positives when two
/// unrelated traits define methods with the same name. The practical false-
/// positive rate on real Rust codebases is low because method names are
/// usually unique within a crate.
#[must_use]
pub fn build_trait_impl_edges_pub(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
    build_trait_impl_edges(files)
}

/// Collect bidirectional trait↔impl method edges, each with weight 1.
///
/// A definition counts as a *trait method* when its kind is
/// `"function_signature_item"`, or when its kind is `"function_item"` and its
/// scope starts with `"trait_item"`. It counts as an *impl method* when its
/// kind is `"function_item"` and its scope starts with `"impl_item"`.
///
/// A trait method and an impl method with the same name are linked in both
/// directions when they live in the same file or the impl's file has a
/// resolved import of the trait's file.
///
/// The returned edges are sorted, so the output does not depend on hash-map
/// iteration order and is identical from run to run.
fn build_trait_impl_edges(files: &[FileNode]) -> Vec<(DefId, DefId, u32)> {
    // Index methods by name. Keys borrow from `files`, so no name is cloned.
    let mut trait_methods: HashMap<&str, Vec<DefId>> = HashMap::new();
    let mut impl_methods: HashMap<&str, Vec<DefId>> = HashMap::new();

    for (fi, file) in files.iter().enumerate() {
        for (di, def) in file.defs.iter().enumerate() {
            #[expect(clippy::cast_possible_truncation)]
            let did: DefId = (fi as u32, di as u16);
            let is_fn_item = def.kind == "function_item";
            if def.kind == "function_signature_item"
                || (is_fn_item && def.scope.starts_with("trait_item"))
            {
                trait_methods
                    .entry(def.name.as_str())
                    .or_default()
                    .push(did);
            } else if is_fn_item && def.scope.starts_with("impl_item") {
                impl_methods.entry(def.name.as_str()).or_default().push(did);
            }
        }
    }

    // Per-file set of resolved import targets, used to restrict matching.
    let imported_sets: Vec<std::collections::HashSet<u32>> = files
        .iter()
        .map(|f| {
            f.imports
                .iter()
                .filter_map(|imp| imp.resolved_idx)
                .collect()
        })
        .collect();

    let mut edges: Vec<(DefId, DefId, u32)> = Vec::new();

    for (name, trait_defs) in &trait_methods {
        let Some(impl_defs) = impl_methods.get(name) else {
            continue;
        };
        for &(tf, td) in trait_defs {
            for &(imf, imd) in impl_defs {
                // The impl file must import the trait file (or be the same file).
                let connected = tf == imf
                    || imported_sets
                        .get(imf as usize)
                        .is_some_and(|s| s.contains(&tf));
                if connected {
                    let trait_id: DefId = (tf, td);
                    let impl_id: DefId = (imf, imd);
                    edges.push((trait_id, impl_id, 1));
                    edges.push((impl_id, trait_id, 1));
                }
            }
        }
    }

    // `trait_methods` is a HashMap, so the loop above visits names in an
    // arbitrary order; sorting makes the result deterministic.
    edges.sort_unstable();
    edges
}

/// Build definition-level edges, compute `PageRank`, and derive file-level data.
///
/// Steps:
/// 1. Count resolved calls per `(caller, callee)` definition pair.
/// 2. Add the trait↔impl edges from `build_trait_impl_edges`.
/// 3. Run uniform `PageRank` over the flattened definition graph.
/// 4. Sum definition ranks per file to get file-level ranks.
/// 5. Collapse cross-file definition edges into file-level edges.
/// 6. Build per-definition caller/callee lists.
///
/// Both edge lists are sorted before use. They are accumulated in hash maps,
/// whose iteration order is arbitrary; without sorting, the stored edge
/// lists, the tie-breaking among equally weighted neighbors, and the
/// floating-point summation order inside `PageRank` would differ from run to
/// run on identical input.
fn compute_def_graph(files: &[FileNode]) -> DefGraphData {
    // Build definition-level edge list from resolved calls
    let mut def_edge_map: HashMap<(DefId, DefId), u32> = HashMap::new();
    for (file_idx, file) in files.iter().enumerate() {
        for (def_idx, def) in file.defs.iter().enumerate() {
            #[expect(clippy::cast_possible_truncation)]
            let caller_id: DefId = (file_idx as u32, def_idx as u16);
            for call in &def.calls {
                if let Some(callee_id) = call.resolved {
                    *def_edge_map.entry((caller_id, callee_id)).or_insert(0) += 1;
                }
            }
        }
    }

    // Add trait↔impl bidirectional edges (G3).
    let trait_impl_edges = build_trait_impl_edges(files);
    for (src, dst, w) in trait_impl_edges {
        *def_edge_map.entry((src, dst)).or_insert(0) += w;
    }

    let mut def_edges: Vec<(DefId, DefId, u32)> = def_edge_map
        .into_iter()
        .map(|((src, dst), w)| (src, dst, w))
        .collect();
    // Deterministic order (see the function docs).
    def_edges.sort_unstable();

    // Compute def-level PageRank
    let offsets = def_offsets(files);
    let n_defs = *offsets.last().unwrap_or(&0);

    let flat_def_edges: Vec<(u32, u32, u32)> = def_edges
        .iter()
        .map(|(src, dst, w)| {
            #[expect(clippy::cast_possible_truncation)]
            (
                flatten_def_id(&offsets, *src) as u32,
                flatten_def_id(&offsets, *dst) as u32,
                *w,
            )
        })
        .collect();

    let def_ranks = pagerank(n_defs, &flat_def_edges, None);

    // Aggregate def ranks to file level
    let base_ranks: Vec<f32> = files
        .iter()
        .enumerate()
        .map(|(i, file)| {
            let start = offsets[i];
            let end = start + file.defs.len();
            def_ranks[start..end].iter().sum()
        })
        .collect();

    // Derive file-level edges from def-level call edges; edges that stay
    // inside one file are dropped.
    let mut file_edge_map: HashMap<(u32, u32), u32> = HashMap::new();
    for &(src, dst, w) in &def_edges {
        let src_file = src.0;
        let dst_file = dst.0;
        if src_file != dst_file {
            *file_edge_map.entry((src_file, dst_file)).or_insert(0) += w;
        }
    }
    let mut file_edges: Vec<(u32, u32, u32)> = file_edge_map
        .into_iter()
        .map(|((src, dst), w)| (src, dst, w))
        .collect();
    // Deterministic order (see the function docs).
    file_edges.sort_unstable();

    // Build def-level caller/callee lists
    let (def_callers, def_callees) = build_def_neighbor_lists(n_defs, &flat_def_edges, &offsets);

    DefGraphData {
        def_edges,
        def_ranks,
        def_callers,
        def_callees,
        offsets,
        base_ranks,
        file_edges,
    }
}

/// Build a dependency graph from a repository root.
///
/// Walks the directory tree, parses each supported file with tree-sitter,
/// extracts definitions and imports, resolves import paths to files, runs
/// `PageRank`, and builds caller/callee lists.
///
/// Ignore patterns from a ripvec config found for `root` (if any) are applied
/// to the walk.
///
/// # Errors
///
/// Returns an error if `root` cannot be canonicalized. Files whose contents
/// cannot be read into a `String` are skipped rather than reported.
#[expect(
    clippy::too_many_lines,
    reason = "three-phase parallel pipeline (walk+filter, def+import extraction, call extraction) \
              plus resolve + graph build; phases share state (file_index, raw_sources) and \
              cannot be meaningfully split without passing large mutable structures across \
              boundaries with no clarity gain"
)]
pub fn build_graph(root: &Path) -> crate::Result<RepoGraph> {
    let root = root.canonicalize().map_err(|e| crate::Error::Io {
        path: root.display().to_string(),
        source: e,
    })?;

    let mut walk_options = walk::WalkOptions::default();
    if let Some((_, config)) = crate::cache::config::find_config(&root) {
        walk_options.ignore_patterns = config.ignore.patterns;
    }
    let all_files = walk::collect_files_with_options(&root, &walk_options);

    // Phase 1: parallel filter + read. For each candidate path with a
    // supported extension, read its source from disk and emit a tuple
    // alongside its relative path. rayon spreads the I/O cost across
    // worker threads; on a 1M-file corpus this was ~20s sequential and
    // now sits in the 2-3s range bounded by disk + filter throughput.
    //
    // A file is kept when its extension has either a language config or an
    // import query; a file that fails to read is dropped silently.
    let raw_inputs: Vec<(PathBuf, String, String, String)> = all_files
        .par_iter()
        .filter_map(|path| {
            let ext = path
                .extension()
                .and_then(|e| e.to_str())
                .unwrap_or_default()
                .to_string();
            if languages::config_for_extension(&ext).is_none()
                && import_query_for_extension(&ext).is_none()
            {
                return None;
            }
            let source = std::fs::read_to_string(path).ok()?;
            let rel_path = path
                .strip_prefix(&root)
                .unwrap_or(path)
                .display()
                .to_string();
            Some((path.clone(), rel_path, ext, source))
        })
        .collect();

    // Build the contiguous `files` Vec and the absolute-path -> idx
    // lookup. Sequential because both want stable indices that match
    // `raw_sources`'s order; the per-file work this gates is trivial.
    // The index stored in each `raw_sources` entry is not read by the
    // phases below (they destructure it as `_`).
    let mut file_index: HashMap<PathBuf, usize> = HashMap::with_capacity(raw_inputs.len());
    let mut files: Vec<FileNode> = Vec::with_capacity(raw_inputs.len());
    let mut raw_sources: Vec<(usize, String, String)> = Vec::with_capacity(raw_inputs.len());
    for (idx, (abs_path, rel_path, ext, source)) in raw_inputs.into_iter().enumerate() {
        file_index.insert(abs_path, idx);
        files.push(FileNode {
            path: rel_path,
            defs: vec![],
            imports: vec![],
        });
        raw_sources.push((idx, ext, source));
    }

    // Phase 2: parallel per-file definition + import extraction. Each
    // file's tree-sitter parse + def/import queries are independent;
    // `files.par_iter_mut().zip(raw_sources.par_iter())` lets every rayon
    // worker grind its own slice. The closures here borrow `&root` and
    // `&file_index` immutably (both Sync) and write disjoint `FileNode`
    // slots via the &mut iterator.
    files
        .par_iter_mut()
        .zip(raw_sources.par_iter())
        .for_each(|(file, (_, ext, source))| {
            if let Some(config) = languages::config_for_extension(ext) {
                file.defs = extract_definitions(source, &config);
                // Go method_declaration scopes are empty after the generic
                // extract_definitions pass (the method is a top-level node
                // with no structural parent in CONTAINER_KINDS). Enrich them
                // with the receiver type so that resolve_calls Priority 2
                // (scope.contains(recv_type)) fires correctly for cross-file
                // Go receiver-method calls. This populates def_callers[] for
                // Go in compute_def_graph (P1 fix).
                if languages::is_go_language(&config.language) {
                    enrich_go_method_def_scopes(source, &mut file.defs);
                }
                // SQL: prepend a synthetic file-level def named after the
                // filename stem (dbt/sqlmesh convention). The whole-file
                // byte range becomes the smallest-enclosing fallback for
                // FROM/JOIN call-edges that are not inside any CTE, which
                // is the resolution target for cross-model references
                // (S1, Wave 4). file.path is relative to the repo root and
                // is what file_stem() needs to derive the model name.
                if languages::is_sql_language(&config.language) {
                    enrich_sql_file_def(&file.path, source, &mut file.defs);
                }
            }
            if let Some((lang, import_query)) = import_query_for_extension(ext) {
                let raw_imports = extract_imports(source, &lang, &import_query);
                let file_path = root.join(&file.path);
                // An import whose target cannot be resolved (or whose index
                // does not fit in u32) is kept with `resolved_idx: None`.
                file.imports = raw_imports
                    .into_iter()
                    .map(|raw| {
                        let resolved_idx =
                            resolve_import(&raw, ext, &file_path, &root, &file_index)
                                .and_then(|i| u32::try_from(i).ok());
                        ImportRef {
                            raw_path: raw,
                            resolved_idx,
                        }
                    })
                    .collect();
            }
        });

    // Phase 3: parallel per-file call extraction. Mutates each
    // FileNode's `defs[*].calls` independently. Aligned with
    // raw_sources by index via the zip.
    files
        .par_iter_mut()
        .zip(raw_sources.par_iter())
        .for_each(|(file, (_, ext, source))| {
            if let Some(call_config) = languages::call_query_for_extension(ext) {
                extract_calls(source, &call_config, &mut file.defs);
            }
        });

    // Build the Python class hierarchy (class_name → parent class names) by
    // walking every Python source file. The map is used by `resolve_calls`
    // Priority 2.5 to dispatch `self.method()` calls through the MRO when
    // the method lives on a parent / mixin class (Q1, Wave 2).
    //
    // Parallel: extract_python_class_hierarchy is pure per-file, then we
    // fold the per-file maps into one global map sequentially because
    // HashMap is not lock-free. On a 1k-Python-file corpus this fold takes
    // <10ms — much smaller than the parallel parse work that feeds it.
    let python_hierarchies: Vec<HashMap<String, Vec<String>>> = raw_sources
        .par_iter()
        .map(|(_, ext, source)| {
            if ext == "py" || ext == "pyi" {
                extract_python_class_hierarchy(source)
            } else {
                HashMap::new()
            }
        })
        .collect();
    let mut python_class_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
    for local in python_hierarchies {
        for (k, v) in local {
            // First declaration wins on name collisions across files. The
            // MRO walk only needs a plausible parent chain to find an
            // ancestor's methods; this is conservative but acceptable.
            python_class_hierarchy.entry(k).or_insert(v);
        }
    }

    // Resolve call references to target definitions
    let def_index = build_def_index(&files);
    resolve_calls(&mut files, &def_index, &python_class_hierarchy);

    // Build def-level graph, compute PageRank, and derive file-level data
    let graph_data = compute_def_graph(&files);

    // Build file-level caller/callee lists
    let n = files.len();
    let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);

    // Auto-tune alpha based on graph density: the number of distinct directed
    // file edges divided by the n * (n - 1) possible ones. alpha ranges from
    // 0.5 (no edges) up to 0.8 (density clamped at 1.0).
    #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
    let density = if n > 1 {
        graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
    } else {
        0.0
    };
    let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);

    Ok(RepoGraph {
        files,
        edges: graph_data.file_edges,
        base_ranks: graph_data.base_ranks,
        callers,
        callees,
        def_edges: graph_data.def_edges,
        def_ranks: graph_data.def_ranks,
        def_callers: graph_data.def_callers,
        def_callees: graph_data.def_callees,
        def_offsets: graph_data.offsets,
        alpha,
    })
}

/// Build a `RepoGraph` directly from a pre-constructed `Vec<FileNode>`.
///
/// Skips the filesystem walk phase of [`build_graph`]; useful for integration
/// tests that want to build synthetic graphs without touching disk.
///
/// Resolves calls, builds the def-level graph (including G3 trait↔impl edges),
/// computes `PageRank`, and builds caller/callee lists.
#[must_use]
pub fn build_graph_from_files_pub(mut files: Vec<FileNode>) -> RepoGraph {
    let def_index = build_def_index(&files);
    // No source is available at this entry point, so the Python class
    // hierarchy is empty and the MRO walk (Priority 2.5) is a no-op.
    // Tests that need MRO resolution should drive `resolve_calls_with_python_mro_pub`
    // directly and then call `build_graph_from_files_pub` for the rest of the pipeline.
    let empty_hierarchy: HashMap<String, Vec<String>> = HashMap::new();
    resolve_calls(&mut files, &def_index, &empty_hierarchy);
    let graph_data = compute_def_graph(&files);
    let n = files.len();
    let (callers, callees) = build_neighbor_lists(n, &graph_data.file_edges);

    #[expect(clippy::cast_precision_loss, reason = "graph sizes fit in f32")]
    let density = if n > 1 {
        graph_data.file_edges.len() as f32 / (n as f32 * (n as f32 - 1.0))
    } else {
        0.0
    };
    let alpha = 0.3f32.mul_add(density.min(1.0), 0.5);

    RepoGraph {
        files,
        edges: graph_data.file_edges,
        base_ranks: graph_data.base_ranks,
        callers,
        callees,
        def_edges: graph_data.def_edges,
        def_ranks: graph_data.def_ranks,
        def_callers: graph_data.def_callers,
        def_callees: graph_data.def_callees,
        def_offsets: graph_data.offsets,
        alpha,
    }
}

impl RepoGraph {
    /// Get the `PageRank` score for a specific definition.
    ///
    /// Returns `0.0` when `did` does not address a definition of this graph:
    /// the file index is outside `def_offsets`, the definition index runs
    /// past the range that `def_offsets` assigns to that file, or the flat
    /// index has no entry in `def_ranks`. The lookup never panics.
    #[must_use]
    pub fn def_rank(&self, did: DefId) -> f32 {
        let file_idx = did.0 as usize;
        let Some(&start) = self.def_offsets.get(file_idx) else {
            return 0.0;
        };
        let flat = start + usize::from(did.1);
        // The next offset marks where the following file's definitions begin.
        // Without this check an oversized definition index would silently
        // read the rank of a definition that belongs to another file.
        if self
            .def_offsets
            .get(file_idx + 1)
            .is_some_and(|&end| flat >= end)
        {
            return 0.0;
        }
        self.def_ranks.get(flat).copied().unwrap_or(0.0)
    }

    /// Look up a definition by file path and name. Returns the first match.
    ///
    /// `file_path` must equal the stored [`FileNode::path`] exactly, and
    /// `def_name` must equal the definition's name exactly.
    #[must_use]
    pub fn find_def(&self, file_path: &str, def_name: &str) -> Option<DefId> {
        self.files
            .iter()
            .enumerate()
            .filter(|(_, file)| file.path == file_path)
            .find_map(|(file_idx, file)| {
                let def_idx = file.defs.iter().position(|def| def.name == def_name)?;
                #[expect(clippy::cast_possible_truncation)]
                Some((file_idx as u32, def_idx as u16))
            })
    }

    /// Resolve a caller-supplied `focus_file` string to a file index in [`Self::files`].
    ///
    /// Accepts any of the path forms that ripvec itself emits or accepts:
    ///
    /// - **Exact stored path** (`device_opt/services/storage.py`) — direct match.
    /// - **LSP-shaped path** (`./device_opt/services/storage.py`) — the `./`
    ///   prefix used by every [`RepoMapLspLocation::file_path`] is stripped
    ///   before comparison so the documented chaining pattern
    ///   `get_repo_map(focus_file=hits[0].lsp_location.file_path)` works.
    /// - **Strict suffix** (`storage.py`, `services/storage.py`) — match when
    ///   the previous character in the stored path is `/`. Avoids matching
    ///   `foo_storage.py` for `storage.py`.
    ///
    /// Returns [`FocusResolution::Found`] when exactly one file matches,
    /// [`FocusResolution::Ambiguous`] when multiple files match (the caller
    /// surfaces the candidate list to the user), and [`FocusResolution::NotFound`]
    /// when no file matches.
    ///
    /// # Background
    ///
    /// Prior to this helper the MCP layer (`crates/ripvec-mcp/src/tools.rs`)
    /// did the matching inline with two bugs:
    ///
    /// 1. **`./` prefix mismatch.** [`RepoMapLspLocation::file_path`] always
    ///    carries a leading `./` (see [`file_lsp_location`]), but
    ///    [`FileNode::path`] does not. Passing the LSP location verbatim as
    ///    `focus_file` matched zero files. The matcher silently returned
    ///    `focus = None`, producing rank values bit-identical to the unfocused
    ///    call — the bug originally reported as "I#20 focus_file rebias
    ///    invisible on Python".
    /// 2. **Equal-length false negative.** When the user passed
    ///    `./device_opt/services/storage.py` and the stored path was
    ///    `device_opt/services/storage.py`, `exact` was false (the strings
    ///    differ by two bytes) and `strict_suffix` was false (the focus is
    ///    longer than the stored path, so `p.len() > focus.len()` fails). The
    ///    pathology surfaced specifically when the focus was a *full* path
    ///    with the LSP `./` prefix.
    ///
    /// Centralising the resolution here gives every caller the same
    /// normalization-tolerant semantics and one place to test the contract.
    #[must_use]
    pub fn resolve_focus_file(&self, focus: &str) -> FocusResolution {
        let normalized = normalize_focus_path(focus);
        let matches: Vec<usize> = self
            .files
            .iter()
            .enumerate()
            .filter(|(_, f)| focus_matches_path(&f.path, normalized))
            .map(|(idx, _)| idx)
            .collect();
        match matches.as_slice() {
            [] => FocusResolution::NotFound,
            [only] => FocusResolution::Found(*only),
            many => FocusResolution::Ambiguous(
                many.iter().map(|&i| self.files[i].path.clone()).collect(),
            ),
        }
    }
}

/// Result of resolving a user-supplied `focus_file` string against a [`RepoGraph`].
///
/// See [`RepoGraph::resolve_focus_file`] for the resolution semantics and the
/// historical bug that motivated the helper.
#[derive(Debug, Clone)]
pub enum FocusResolution {
    /// Exactly one file matched. Carries the file index in [`RepoGraph::files`].
    Found(usize),
    /// No file matched. The caller treats this as an unfocused call.
    NotFound,
    /// Two or more files matched. Carries the stored path of every matching
    /// file, in file-index order. The caller surfaces the candidate list so
    /// the user can disambiguate by passing a longer suffix or the full path.
    Ambiguous(Vec<String>),
}

/// Remove one leading `./` from a `focus_file` string, if present.
///
/// [`file_lsp_location`] emits the `./` form for every
/// [`RepoMapLspLocation::file_path`] on a relative path, whereas
/// [`FileNode::path`] entries carry no such prefix. Dropping it yields a
/// value shaped like a stored path, ready for the suffix matcher.
///
/// Any input that does not start with `./` is returned untouched. That
/// includes absolute paths (`/abs/path/file.py`), which then fail the suffix
/// match against the relative stored paths — the intended outcome, since the
/// caller meant a different root entirely.
fn normalize_focus_path(focus: &str) -> &str {
    let without_dot_slash = focus.strip_prefix("./");
    without_dot_slash.unwrap_or(focus)
}

/// Decide whether `focus` names `stored_path`.
///
/// A match is either the whole stored path, or a trailing portion of it that
/// begins right after a `/` (so `storage.py` matches `a/storage.py` but not
/// `a/foo_storage.py`). An empty `focus` never matches.
fn focus_matches_path(stored_path: &str, focus: &str) -> bool {
    // What precedes the focus must be empty (exact match) or end in the
    // path separator (strict suffix).
    !focus.is_empty()
        && stored_path
            .strip_suffix(focus)
            .is_some_and(|parent| parent.is_empty() || parent.ends_with('/'))
}

/// Build the strongest caller and callee lists for every file.
///
/// `edges` are weighted directed edges `(src, dst, weight)` over `n` nodes;
/// an edge that references a node index `>= n` is ignored. The result is
/// `(callers, callees)`, where `callers[i]` / `callees[i]` hold at most
/// [`MAX_NEIGHBORS`] adjacent nodes of node `i`, ordered by descending edge
/// weight. Neighbors with equal weight keep the order in which their edges
/// were supplied.
///
/// Exposed as `pub` so that integration tests can construct synthetic
/// [`RepoGraph`] instances for unit-testing the JSON rendering without going
/// through a full disk walk.
#[must_use]
pub fn build_neighbor_lists(n: usize, edges: &[(u32, u32, u32)]) -> (Vec<Vec<u32>>, Vec<Vec<u32>>) {
    /// Order one adjacency list heaviest-first and keep the leading node ids.
    fn strongest(mut weighted: Vec<(u32, u32)>) -> Vec<u32> {
        weighted.sort_by_key(|entry| std::cmp::Reverse(entry.1));
        weighted.truncate(MAX_NEIGHBORS);
        weighted.into_iter().map(|(node, _)| node).collect()
    }

    let mut inbound: Vec<Vec<(u32, u32)>> = vec![Vec::new(); n];
    let mut outbound: Vec<Vec<(u32, u32)>> = vec![Vec::new(); n];

    for &(src, dst, weight) in edges {
        let (from, to) = (src as usize, dst as usize);
        if from >= n || to >= n {
            continue;
        }
        inbound[to].push((src, weight));
        outbound[from].push((dst, weight));
    }

    (
        inbound.into_iter().map(strongest).collect(),
        outbound.into_iter().map(strongest).collect(),
    )
}

// ── Rendering ────────────────────────────────────────────────────────

/// Render a budget-constrained overview of the repository.
///
/// Files are sorted by `PageRank` (or topic-sensitive rank if `focus` is
/// `Some`). Output uses four tiers of decreasing detail:
///
/// - **Tier 0** (top 10%): full path, rank, callers/callees, signatures with scopes
/// - **Tier 1** (next 20%): full path, rank, signatures
/// - **Tier 2** (next 40%): full path, rank, definition names and kinds
/// - **Tier 3** (bottom 30%): file path only
///
/// Stops accumulating output when the estimated token count exceeds
/// `max_tokens`. The first section is always emitted, even if it alone is
/// larger than the budget. When a later section would overflow, only that
/// file's path line is appended (if it still fits) and rendering stops.
///
/// Any `max_tokens` value is accepted, including `usize::MAX` as an
/// "unlimited" budget. A file without an entry in the rank vector is treated
/// as having rank `0.0`. Returns an empty string for a graph with no files.
#[must_use]
pub fn render(graph: &RepoGraph, max_tokens: usize, focus: Option<usize>) -> String {
    let n = graph.files.len();
    if n == 0 {
        return String::new();
    }

    // Compute ranks (recompute topic-sensitive if focus is given). The
    // unfocused case borrows the stored ranks instead of cloning them.
    let focused_ranks;
    let ranks: &[f32] = if focus.is_some() {
        focused_ranks = pagerank(n, &graph.edges, focus);
        &focused_ranks
    } else {
        &graph.base_ranks
    };
    // Checked lookup: a hand-built graph may carry fewer ranks than files.
    let rank_of = |file_idx: usize| ranks.get(file_idx).copied().unwrap_or(0.0);

    // Sort file indices by rank descending
    let mut sorted: Vec<usize> = (0..n).collect();
    sorted.sort_by(|&a, &b| rank_of(b).total_cmp(&rank_of(a)));

    let mut output = String::new();
    let mut used_tokens = 0;
    // Saturate rather than overflow when the caller passes a huge budget.
    let max_chars = max_tokens.saturating_mul(CHARS_PER_TOKEN);

    for (rank_pos, &file_idx) in sorted.iter().enumerate() {
        if used_tokens >= max_tokens {
            break;
        }

        let file = &graph.files[file_idx];
        let score = rank_of(file_idx);
        #[expect(clippy::cast_precision_loss, reason = "file counts fit in f32")]
        let percentile = (rank_pos as f32) / (n as f32);

        let section = if percentile < 0.1 {
            render_tier0(graph, file_idx, file, score)
        } else if percentile < 0.3 {
            render_tier1(file, score)
        } else if percentile < 0.7 {
            render_tier2(file, score)
        } else {
            render_tier3(file)
        };

        let section_chars = section.len();
        if used_tokens > 0 && used_tokens + section_chars / CHARS_PER_TOKEN > max_tokens {
            // Would exceed budget — try to fit at least the path
            let path_line = format!("{}\n", file.path);
            let path_tokens = path_line.len() / CHARS_PER_TOKEN;
            if used_tokens + path_tokens <= max_tokens {
                output.push_str(&path_line);
            }
            break;
        }

        output.push_str(&section);
        // Clamp so an oversized first section cannot push the estimate past
        // the budget.
        used_tokens = output.len().min(max_chars) / CHARS_PER_TOKEN;
    }

    output
}

/// Render tier 0: heading with rank, caller and callee file lists, and one
/// line per definition (scope, kind, and signature or name).
///
/// Caller/callee lines are omitted when the file has no entry in the
/// corresponding adjacency list or the entry is empty. Neighbour indices that
/// do not resolve to a file are silently dropped.
fn render_tier0(graph: &RepoGraph, file_idx: usize, file: &FileNode, score: f32) -> String {
    let mut out = format!("## {} (rank: {score:.4})\n", file.path);

    // Callers first, then callees — both rendered the same way.
    for (label, adjacency) in [
        ("  called by: ", &graph.callers),
        ("  calls: ", &graph.callees),
    ] {
        let Some(neighbours) = adjacency.get(file_idx).filter(|list| !list.is_empty()) else {
            continue;
        };
        let paths: Vec<&str> = neighbours
            .iter()
            .filter_map(|&idx| graph.files.get(idx as usize))
            .map(|neighbour| neighbour.path.as_str())
            .collect();
        out.push_str(label);
        out.push_str(&paths.join(", "));
        out.push('\n');
    }

    // Definitions with scope and signature
    for def in &file.defs {
        let mut scope_prefix = String::new();
        if !def.scope.is_empty() {
            let _ = write!(scope_prefix, "{} > ", def.scope);
        }
        let _ = match &def.signature {
            Some(sig) => writeln!(out, "  {scope_prefix}{} {sig}", def.kind),
            None => writeln!(out, "  {scope_prefix}{} {}", def.kind, def.name),
        };
    }

    // Blank line separating this section from the next.
    out.push('\n');
    out
}

/// Render tier 1: heading with rank, then one line per definition showing
/// its signature (or `kind name` when no signature was captured).
fn render_tier1(file: &FileNode, score: f32) -> String {
    let mut out = format!("## {} (rank: {score:.4})\n", file.path);

    for def in &file.defs {
        let _ = match &def.signature {
            Some(sig) => writeln!(out, "  {sig}"),
            None => writeln!(out, "  {} {}", def.kind, def.name),
        };
    }

    // Blank line separating this section from the next.
    out.push('\n');
    out
}

/// Render tier 2: a single line with the file path, its rank, and a
/// comma-separated `kind:name` list of its definitions (omitted when the
/// file has none).
fn render_tier2(file: &FileNode, score: f32) -> String {
    let mut line = format!("{} (rank: {score:.4})", file.path);

    for (i, def) in file.defs.iter().enumerate() {
        // The first entry is introduced by the dash separator, the rest by commas.
        line.push_str(if i == 0 { " -- " } else { ", " });
        let _ = write!(line, "{}:{}", def.kind, def.name);
    }

    line.push('\n');
    line
}

/// Render tier 3: the file path on a line of its own, nothing else.
fn render_tier3(file: &FileNode) -> String {
    [file.path.as_str(), "\n"].concat()
}

// ── JSON rendering ───────────────────────────────────────────────────

/// Build the `lsp_location` that points at a file as a whole.
///
/// All line/character fields are zero. Paths that already start with `./`
/// or `/` are used verbatim; any other path is prefixed with `./`.
fn file_lsp_location(path: &str) -> RepoMapLspLocation {
    let already_anchored = path.starts_with("./") || path.starts_with('/');
    let file_path = if already_anchored {
        path.to_owned()
    } else {
        format!("./{path}")
    };

    RepoMapLspLocation {
        file_path,
        start_line: 0,
        start_character: 0,
        end_line: 0,
        end_character: 0,
    }
}

/// Infer `ContentKind` from a file path's extension.
fn content_kind_for_path(path: &str) -> ContentKind {
    let ext = std::path::Path::new(path)
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or("");
    ContentKind::from_extension(ext)
}

/// Fixed byte cost charged for the JSON envelope of each included file.
///
/// Even a file with zero symbols takes JSON overhead for path, rank, arrays,
/// etc. Calibrated against actual `serde_json` output for an empty `RepoMapFile`:
/// `{"lsp_location":{"file_path":"./src/file_N.rs","start_line":0,"start_character":0,`
/// `"end_line":0,"end_character":0},"rank":0.1234,"content_kind":"code",`
/// `"calls":[],"symbols":[],"truncated_symbols":0,"truncated_calls":0}` ≈ 250 bytes.
///
/// `render_json_budgeted` uses this as each file's starting cost: symbol and
/// call byte estimates are accumulated on top of it, and the calls reserve is
/// computed from what remains of the file's budget after subtracting it.
const FILE_ENVELOPE_MIN_BYTES: usize = 250;

/// Minimum useful payload for an admitted file: envelope plus room for at
/// least 2-3 typical-sized symbols (Fix A, 4.0.2).
///
/// `render_json_budgeted` applies this in two ways: the number of files
/// considered for admission is capped at `budget / FILE_MIN_USEFUL_BYTES`
/// (but never below one), and every admitted file's budget is raised to at
/// least this value. Without this guard, low-rank tail files consume budget
/// on envelopes that contain no symbols or calls, crowding out content for
/// the top files.
const FILE_MIN_USEFUL_BYTES: usize = 600;

/// Fraction of each file's per-file budget reserved for outgoing-call edges
/// after the envelope is paid. The remaining (1 - this) fraction goes to
/// symbols. Symbol leftover flows into calls (the call loop is bounded by the
/// file's full budget, not just the reserve); call leftover flows to the
/// next file. (Fix C, 4.0.2 — without a reserve, the symbol loop saturates
/// the per-file budget and calls always come up empty.)
const CALLS_BUDGET_FRACTION: f64 = 0.30;

/// Maximum fraction of the total budget that a single file's rank-proportional
/// share may claim.
///
/// Without this cap a single very-high-rank file (e.g. `lib.rs`) could
/// consume the entire budget, leaving all other files empty. The cap is
/// applied before the `FILE_MIN_USEFUL_BYTES` floor.
const MAX_FILE_SHARE: f64 = 0.40;

/// Orientation priority of an AST node kind; larger values surface earlier.
///
/// Serves as the structural ordering signal when def-level `PageRank` is
/// degenerate (most ranks near zero). The ordering follows how a reader
/// orients in unfamiliar code: first the file's *shape* (traits, structs,
/// enums, type aliases, modules), then its *behavior* (functions, impls),
/// then plain declarations (consts, statics). Every other kind — fields,
/// variables, parameters, anything unrecognised — gets priority 0.
/// (Fix B, 4.0.2.)
fn ast_kind_priority(kind: &str) -> u32 {
    /// `(node kinds, priority)` rows, grouped into 10-unit-spaced tiers.
    const PRIORITY_TABLE: &[(&[&str], u32)] = &[
        // Tier 3: shape — what THIS file is
        (&["trait_item", "interface", "trait"], 30),
        (&["struct_item", "struct", "class_definition", "class"], 29),
        (&["enum_item", "enum"], 28),
        (&["type_item", "type_alias_declaration", "type_alias"], 27),
        (&["mod_item", "module", "namespace"], 26),
        // Tier 2: behavior — what THIS file does
        (
            &[
                "function_item",
                "function_definition",
                "function",
                "method_definition",
            ],
            20,
        ),
        (&["impl_item", "impl"], 19),
        // Tier 1: declarations
        (&["const_item", "const_declaration", "const"], 10),
        (&["static_item", "static"], 9),
    ];

    // Tier 0 (internals) is the fallback for any kind not listed above.
    PRIORITY_TABLE
        .iter()
        .find(|(kinds, _)| kinds.contains(&kind))
        .map_or(0, |&(_, priority)| priority)
}

/// AST-kind priority with rank-based tier promotion (4.0.4).
///
/// Starts from [`ast_kind_priority`] for `kind` and adds one 10-unit tier
/// for each promotion threshold that `def_rank` strictly exceeds:
///
/// - `def_rank > promo_1` → +1 tier (e.g. a hot function joins the type tier)
/// - `def_rank > promo_2` → +2 tiers in total (extremely hot def)
/// - otherwise            → declared tier preserved
///
/// The thresholds are supplied by the caller, which derives them from a
/// corpus-wide reference def-rank so they self-calibrate: when no def
/// exceeds them, the plain 4.0.2 AST-priority ordering is preserved
/// unchanged.
fn effective_priority(kind: &str, def_rank: f32, promo_1: f32, promo_2: f32) -> u32 {
    /// Tier spacing; matches the 10-unit gaps used by `ast_kind_priority`.
    const TIER_STEP: u32 = 10;

    let mut priority = ast_kind_priority(kind);
    for threshold in [promo_1, promo_2] {
        if def_rank > threshold {
            priority += TIER_STEP;
        }
    }
    priority
}

/// Estimate the serialised JSON byte cost of one `RepoMapSymbol`.
///
/// Calibrated against actual `serde_json` output. A `RepoMapSymbol`
/// serialises to approximately:
/// `{"name":"<N>","kind":<K>,"lsp_location":{"file_path":"<P>","start_line":0,`
/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`
///
/// The estimate is a flat 165 bytes of overhead (braces, keys, fixed-width
/// integers, rank) plus the byte length of `name`. The `file_path` length is
/// deliberately not part of this estimate.
fn estimate_symbol_bytes(name: &str) -> usize {
    /// Per-symbol overhead that does not depend on the symbol's name.
    const SYMBOL_OVERHEAD_BYTES: usize = 165;

    name.len() + SYMBOL_OVERHEAD_BYTES
}

/// Estimate the serialised JSON byte cost of one `RepoMapCall`.
///
/// Each call entry looks like
/// `{"lsp_location":{"file_path":"<P>","start_line":0,`
/// `"start_character":0,"end_line":0,"end_character":0},"rank":<R>}`,
/// i.e. roughly 120 bytes of fixed overhead plus the byte length of the
/// target path.
fn estimate_call_bytes(target_path: &str) -> usize {
    /// Per-call overhead that does not depend on the target path.
    const CALL_OVERHEAD_BYTES: usize = 120;

    target_path.len() + CALL_OVERHEAD_BYTES
}

/// Render a `PageRank`-weighted JSON map with token-budget allocation (4.0.1).
///
/// # Algorithm
///
/// **Step 1 — File-share allocation.** Files are sorted by rank descending
/// and, unless `include_metadata` is set, Meta-classified files are dropped.
/// At most `budget_bytes / FILE_MIN_USEFUL_BYTES` of the remaining files
/// (but never fewer than one, if any is eligible) are considered for
/// admission. Each receives a byte budget proportional to its rank within
/// that admitted set, capped at `MAX_FILE_SHARE` of the total budget and
/// then floored at `FILE_MIN_USEFUL_BYTES`. Files are included in rank order
/// until the cumulative allocation would exceed the total budget; the
/// top-ranked file is always included.
///
/// **Step 2 — Per-file symbol and call fill.** Every included file is charged
/// `FILE_ENVELOPE_MIN_BYTES` up front, and `CALLS_BUDGET_FRACTION` of what
/// remains is reserved for calls. Symbols are ordered by effective priority
/// (AST kind tier plus rank promotion), then def-rank descending, then
/// declaration order. A symbol is skipped when its rank is below
/// `tier_top_rank / (1 + ln(i + 1))`, where `i` is its 0-based position
/// within the current AST-kind tier; the symbol fill stops outright once the
/// symbol sub-budget is exhausted. `calls[]` is filled in target-file
/// base-rank order with the same logarithmic cutoff (here the first miss
/// ends the list), bounded by the file's whole budget. Bytes a file does not
/// use carry over to the next file. `truncated_symbols` and
/// `truncated_calls` count the omitted entries.
///
/// **Step 3 — Response telemetry.** The response includes `estimated_bytes`
/// (length of the serialised `files` array, or 0 if serialisation fails),
/// `budget_bytes` (`token_budget * CHARS_PER_TOKEN`, saturating), and
/// `budget_exhausted` / `capped` (both `total_files > files.len()`).
///
/// # Arguments
///
/// - `graph` — the built dependency graph.
/// - `token_budget` — caller-specified token budget (converted to a byte
///   budget via `CHARS_PER_TOKEN`).
/// - `focus` — optional file index for topic-sensitive `PageRank`. It affects
///   file ranks only; callee ordering always uses the graph's base ranks.
/// - `include_metadata` — when `false` (default), Meta-classified files
///   are excluded before ranking.
#[must_use]
#[expect(
    clippy::cast_precision_loss,
    reason = "rank sums and counts are small f32/f64; precision loss is acceptable"
)]
#[expect(
    clippy::too_many_lines,
    reason = "the three-step allocation algorithm (file-share → symbol-fill → calls-fill) \
              is sequential and share state; splitting into helpers would require passing \
              mutable slices across three boundaries with no clarity gain"
)]
pub fn render_json_budgeted(
    graph: &RepoGraph,
    token_budget: usize,
    focus: Option<usize>,
    include_metadata: bool,
) -> GetRepoMapResponse {
    // Saturate instead of overflowing: `render_json` already saturates its
    // synthetic token budget, so a huge `max_files` arrives here as
    // `usize::MAX` and a plain multiplication would panic in debug builds.
    let budget_total_bytes = token_budget.saturating_mul(CHARS_PER_TOKEN);

    let n = graph.files.len();
    if n == 0 {
        return GetRepoMapResponse {
            files: vec![],
            total_files: 0,
            estimated_bytes: 0,
            budget_bytes: budget_total_bytes,
            budget_exhausted: false,
            capped: false,
        };
    }

    // Recompute topic-sensitive ranks if focus is given.
    let ranks = if focus.is_some() {
        pagerank(n, &graph.edges, focus)
    } else {
        graph.base_ranks.clone()
    };

    // Sort all file indices by rank descending.
    let mut sorted: Vec<usize> = (0..n).collect();
    sorted.sort_by(|&a, &b| ranks[b].total_cmp(&ranks[a]));

    // Apply metadata exclusion filter.
    let eligible: Vec<usize> = if include_metadata {
        sorted
    } else {
        sorted
            .into_iter()
            .filter(|&idx| {
                let kind = content_kind_for_path(&graph.files[idx].path);
                kind != ContentKind::Meta
            })
            .collect()
    };

    let total_files = eligible.len();

    // ── Corpus-reference def-rank thresholds for tier promotion (4.0.4) ─────
    //
    // Compute once per call (corpus-wide, not per-file) so the threshold is
    // self-calibrating: flat distributions (all ranks equal) put the reference
    // at the floor and nothing crosses threshold; informative distributions see
    // proportional promotion. Using a corpus-wide reference ensures a hot
    // function in one file is judged against the entire corpus, not just its
    // local file peers.
    //
    // Use the 75th percentile of nonzero def-ranks as the corpus reference value
    // for tier promotion (rather than the 50th percentile / median). The 75th
    // percentile is more robust: on a flat distribution most defs cluster near the
    // floor, so the 75th percentile is only marginally above the floor (making the
    // 4× threshold very selective). On an informative distribution (post-4.0.3
    // call-edge enrichment) the 75th percentile is meaningfully above the floor,
    // so the same 4× multiplier captures genuinely hot defs without falsely
    // promoting slightly-above-floor helpers.
    //
    // The 50th percentile (lower median) was rejected because on a 10-def corpus
    // with max/min ratio 5× the median equals the floor, causing the 4× threshold
    // to fire on defs that are only 5× above floor (a low-variance corpus). The
    // 75th percentile corrects this without requiring hand-tuned per-corpus magic
    // numbers.
    let corpus_reference_rank: f32 = {
        let mut nonzero: Vec<f32> = graph
            .def_ranks
            .iter()
            .copied()
            .filter(|r| *r > 0.0)
            .collect();
        if nonzero.is_empty() {
            0.0
        } else {
            nonzero.sort_unstable_by(f32::total_cmp);
            let n = nonzero.len();
            // 75th percentile index: floor(0.75 * (n - 1))
            let idx = (3 * (n - 1)) / 4;
            nonzero[idx]
        }
    };
    let promo_1_threshold = corpus_reference_rank * 4.0; // +1 tier
    let promo_2_threshold = corpus_reference_rank * 16.0; // +2 tiers

    // ── Step 1: File-share allocation ────────────────────────────────

    // Greedily determine which files fit within the budget, computing each
    // file's share as it is added. We must run a two-pass approach:
    //   pass A: determine which files are included (cumulative sum check),
    //   pass B: fill symbols/calls using final per-file allocations.
    //
    // The "included" decision is based on the running cumulative sum so that
    // the leftover redistribution in step 2 can carry forward correctly.

    // Floor-first admission (Fix A, 4.0.2):
    //
    // Cap admitted file count so each gets at least FILE_MIN_USEFUL_BYTES.
    // Below this threshold the response would carry envelopes that contain
    // no symbols or calls — pure overhead, no information. Concentrating
    // the budget on fewer files with real content is strictly better for
    // orientation than dropping envelope sentinels for many files.
    let max_admissible = budget_total_bytes / FILE_MIN_USEFUL_BYTES;
    let admit_count = eligible.len().min(max_admissible.max(1));

    let budget_f64 = budget_total_bytes as f64;

    // Pre-compute rank sum across ADMITTED files only (top-N by rank). f64
    // to avoid precision loss when summing many small f32 values.
    let admitted_rank_sum: f64 = eligible
        .iter()
        .take(admit_count)
        .map(|&idx| f64::from(ranks[idx]))
        .sum();
    // Guard against division by zero when every admitted rank is zero.
    let admitted_rank_sum = if admitted_rank_sum > 0.0 {
        admitted_rank_sum
    } else {
        1.0
    };

    // Compute per-file budgets. The proportional-to-rank share is capped at
    // MAX_FILE_SHARE of the total budget and then raised to at least
    // FILE_MIN_USEFUL_BYTES.
    let mut included_indices: Vec<usize> = Vec::new(); // indices into `eligible`
    let mut file_budgets: Vec<usize> = Vec::new();
    let mut cumulative_budget: usize = 0;

    for (i, &file_idx) in eligible.iter().take(admit_count).enumerate() {
        let file_rank = f64::from(ranks[file_idx]);
        let raw_share = budget_f64 * file_rank / admitted_rank_sum;
        let capped = raw_share.min(budget_f64 * MAX_FILE_SHARE);
        // `capped` is non-negative and bounded by budget_f64 (a usize).
        #[expect(
            clippy::cast_possible_truncation,
            clippy::cast_sign_loss,
            reason = "capped is non-negative and bounded by budget_total_bytes (a usize)"
        )]
        let budget_i = (capped as usize).max(FILE_MIN_USEFUL_BYTES);

        // Checked addition: with a saturated byte budget the running sum can
        // exceed `usize::MAX`; an overflowing sum certainly does not fit.
        // The first file is admitted unconditionally (its sum starts from 0,
        // so it can never overflow).
        let next_cumulative = cumulative_budget.checked_add(budget_i);
        let fits = next_cumulative.is_some_and(|total| total <= budget_total_bytes);
        if !fits && !included_indices.is_empty() {
            break;
        }
        cumulative_budget = next_cumulative.unwrap_or(usize::MAX);
        included_indices.push(i);
        file_budgets.push(budget_i);
    }

    // ── Step 2: Per-file symbol fill ─────────────────────────────────

    let mut result_files: Vec<RepoMapFile> = Vec::with_capacity(included_indices.len());
    let mut leftover: usize = 0; // unused bytes carried from previous file

    for (slot, &eligible_i) in included_indices.iter().enumerate() {
        let file_idx = eligible[eligible_i];
        let file = &graph.files[file_idx];
        let file_rank = ranks[file_idx];
        let file_path_lsp = file_lsp_location(&file.path);

        let budget_in = file_budgets[slot] + leftover;

        // Reserve a fraction of the post-envelope budget for outgoing calls
        // (Fix C, 4.0.2). Without this guard the symbol loop saturates
        // `budget_in` and the calls loop always trips its byte-check.
        // Symbol leftover flows into calls; call leftover flows to the
        // next file via the outer `leftover` variable.
        let post_envelope = budget_in.saturating_sub(FILE_ENVELOPE_MIN_BYTES);
        #[expect(
            clippy::cast_possible_truncation,
            clippy::cast_sign_loss,
            reason = "post_envelope * fraction is bounded by post_envelope (a usize)"
        )]
        let calls_reserve = (post_envelope as f64 * CALLS_BUDGET_FRACTION) as usize;
        let symbols_budget = FILE_ENVELOPE_MIN_BYTES + post_envelope.saturating_sub(calls_reserve);
        let mut used: usize = FILE_ENVELOPE_MIN_BYTES; // envelope cost

        // ── Symbols ──────────────────────────────────────────────────
        // Retrieve def-level ranks for this file via the offset table.
        let def_count = file.defs.len();
        let def_offset = if file_idx < graph.def_offsets.len() {
            graph.def_offsets[file_idx]
        } else {
            0
        };

        // Build (def_idx, rank, kind_priority, start_byte) tuples. We sort
        // by a composite key: AST kind priority (descending) — putting types
        // before functions before fields — then by def_rank (descending)
        // within each tier. This is Fix B (4.0.2): the def_rank distribution
        // is often degenerate (most defs share near-zero rank because the
        // call-edge extractor doesn't capture every dispatch), so we use
        // structural signal as the primary ordering and def_rank as the
        // within-tier tiebreaker. When def_rank IS informative, it dominates
        // *within* its kind tier and recovers the original behavior; the AST
        // signal only shifts ordering *between* tiers.
        let mut def_rank_pairs: Vec<(usize, f32, u32, u32)> = (0..def_count)
            .map(|di| {
                let flat = def_offset + di;
                let r = graph.def_ranks.get(flat).copied().unwrap_or(0.0);
                // Store the ORIGINAL ast_kind_priority in the tuple (used by the
                // per-tier attenuation loop below). The sort comparator uses
                // effective_priority (which may be higher due to 4.0.4 promotion)
                // to reorder hot defs ahead of cold type-tier defs, while the
                // attenuation tier tracker continues to use the original AST tier
                // so the existing per-tier cutoff behaviour is preserved.
                let kind_prio = ast_kind_priority(&file.defs[di].kind);
                let decl_order = file.defs[di].start_byte;
                (di, r, kind_prio, decl_order)
            })
            .collect();
        def_rank_pairs.sort_unstable_by(|a, b| {
            // Primary: effective priority (4.0.4: AST kind + corpus-rank promotion) descending.
            // Hot defs that exceed the corpus-reference thresholds are promoted above
            // their declared tier so they surface before cold type-tier defs.
            let eff_a = effective_priority(
                &file.defs[a.0].kind,
                a.1,
                promo_1_threshold,
                promo_2_threshold,
            );
            let eff_b = effective_priority(
                &file.defs[b.0].kind,
                b.1,
                promo_1_threshold,
                promo_2_threshold,
            );
            eff_b
                .cmp(&eff_a)
                // Secondary: def_rank descending within tier.
                .then_with(|| b.1.total_cmp(&a.1))
                // Tertiary: earlier declaration order (stable, deterministic).
                .then_with(|| a.3.cmp(&b.3))
        });

        let top_def_rank = def_rank_pairs.first().map(|&(_, r, _, _)| r).unwrap_or(0.0);

        let mut symbols: Vec<RepoMapSymbol> = Vec::new();
        let mut truncated_symbols: usize = 0;

        // Track per-tier position for the attenuation cutoff. When AST kind
        // priority changes (we've moved from types to functions, say), reset
        // the position so the attenuation curve restarts. Otherwise a
        // structurally-equivalent-but-later tier would be unfairly cut.
        let mut tier_pos: usize = 0;
        let mut current_tier: Option<u32> = None;
        let mut tier_top_rank: f32 = top_def_rank;

        for (pos, &(di, def_r, kind_prio, _)) in def_rank_pairs.iter().enumerate() {
            // Reset attenuation at tier boundaries.
            if current_tier != Some(kind_prio) {
                current_tier = Some(kind_prio);
                tier_pos = 0;
                tier_top_rank = def_r;
            }

            // Logarithmic attenuation cutoff, relative to the tier's top rank.
            let cutoff = if tier_top_rank > 0.0 {
                tier_top_rank / (1.0 + (tier_pos as f32 + 1.0).ln())
            } else {
                0.0
            };
            if def_r < cutoff {
                // Attenuation cuts the rest of THIS tier; we don't stop
                // entirely because the next tier may still have useful
                // content within its own attenuation curve. Skip this def.
                truncated_symbols += 1;
                tier_pos += 1;
                continue;
            }

            let def = &file.defs[di];
            let sym_bytes = estimate_symbol_bytes(&def.name);
            // Use the reserved symbols sub-budget (Fix C) so calls aren't
            // starved when symbols would otherwise saturate budget_in.
            if used + sym_bytes > symbols_budget {
                // Everything from this position onward is dropped; earlier
                // attenuation skips were already counted one by one.
                truncated_symbols += def_rank_pairs.len() - pos;
                break;
            }

            let kind = crate::languages::lsp_symbol_kind_for_node_kind(&def.kind);
            let line_0 = def.start_line.saturating_sub(1) as usize;
            symbols.push(RepoMapSymbol {
                name: def.name.clone(),
                kind,
                lsp_location: RepoMapLspLocation {
                    file_path: file_path_lsp.file_path.clone(),
                    start_line: line_0,
                    start_character: 0,
                    end_line: line_0,
                    end_character: 0,
                },
                rank: def_r,
            });
            used += sym_bytes;
            tier_pos += 1;
        }

        // ── Calls ─────────────────────────────────────────────────────
        // Gather outgoing callees sorted by target file base_rank descending.
        let callee_indices: Vec<usize> = if file_idx < graph.callees.len() {
            let mut callees: Vec<(usize, f32)> = graph.callees[file_idx]
                .iter()
                .filter_map(|&ci| {
                    let ci = ci as usize;
                    graph.files.get(ci).map(|_| {
                        let r = graph.base_ranks.get(ci).copied().unwrap_or(0.0);
                        (ci, r)
                    })
                })
                .collect();
            callees.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
            callees.into_iter().map(|(ci, _)| ci).collect()
        } else {
            vec![]
        };

        let call_total = callee_indices.len();
        let top_call_rank = callee_indices
            .first()
            .and_then(|&ci| graph.base_ranks.get(ci))
            .copied()
            .unwrap_or(0.0);

        let mut calls: Vec<RepoMapCall> = Vec::new();
        let mut truncated_calls: usize = 0;

        for (pos, &ci) in callee_indices.iter().enumerate() {
            let callee_rank = graph.base_ranks.get(ci).copied().unwrap_or(0.0);

            // Logarithmic attenuation cutoff on target rank.
            let cutoff = if top_call_rank > 0.0 {
                top_call_rank / (1.0 + (pos as f32 + 1.0).ln())
            } else {
                0.0
            };
            if callee_rank < cutoff {
                truncated_calls += call_total - pos;
                break;
            }

            let callee_path = &graph.files[ci].path;
            let call_bytes = estimate_call_bytes(callee_path);
            // Calls are bounded by the whole file budget, so bytes the
            // symbol loop left unused are available here.
            if used + call_bytes > budget_in {
                truncated_calls += call_total - pos;
                break;
            }

            calls.push(RepoMapCall {
                lsp_location: file_lsp_location(callee_path),
                rank: callee_rank,
            });
            used += call_bytes;
        }

        // Carry unused bytes forward to the next file.
        leftover = budget_in.saturating_sub(used);

        result_files.push(RepoMapFile {
            lsp_location: file_path_lsp,
            rank: file_rank,
            content_kind: content_kind_tag(content_kind_for_path(&file.path)),
            calls,
            symbols,
            truncated_symbols,
            truncated_calls,
        });
    }

    // ── Step 3: Response telemetry ───────────────────────────────────

    // Measured (not estimated) size of the serialised file list; 0 if
    // serialisation fails.
    let estimated_bytes = serde_json::to_string(&result_files)
        .map(|s| s.len())
        .unwrap_or(0);

    let budget_exhausted = total_files > result_files.len();

    GetRepoMapResponse {
        files: result_files,
        total_files,
        estimated_bytes,
        budget_bytes: budget_total_bytes,
        budget_exhausted,
        capped: budget_exhausted,
    }
}

/// Render a `PageRank`-sorted JSON map of the repository (4.0.0 compatibility shim).
///
/// Thin wrapper around [`render_json_budgeted`]: the file-count limit is
/// translated into a synthetic token budget of 2000 tokens per requested
/// file (saturating on overflow). It exists to keep the existing D1/D2 unit
/// tests compiling without change; the MCP layer calls
/// [`render_json_budgeted`] directly in 4.0.1.
///
/// The `capped` field in the response reflects whether the budget was
/// exhausted before all `eligible` files were included, which is equivalent
/// to the previous `total_files > max_files` check.
///
/// When `include_metadata` is `false` (default), files whose extension
/// classifies as [`ContentKind::Meta`] are excluded before ranking.
#[must_use]
pub fn render_json(
    graph: &RepoGraph,
    max_files: usize,
    focus: Option<usize>,
    include_metadata: bool,
) -> GetRepoMapResponse {
    /// Generous per-file allowance. The D1/D2 tests (which pass small
    /// `max_files` values like 3, 5, 50) assert on file counts, not byte
    /// sizes, so the exact value only needs to leave enough headroom.
    const TOKENS_PER_FILE: usize = 2000;

    render_json_budgeted(
        graph,
        max_files.saturating_mul(TOKENS_PER_FILE),
        focus,
        include_metadata,
    )
}

// ── Tests ────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pagerank_simple() {
        // Directed 3-cycle (0 -> 1 -> 2 -> 0): every node has exactly one
        // inbound and one outbound edge, all with the same weight.
        let cycle = vec![(0, 1, 1), (1, 2, 1), (2, 0, 1)];
        let ranks = pagerank(3, &cycle, None);
        assert_eq!(ranks.len(), 3);

        // The scores must add up to (approximately) one.
        let sum = ranks.iter().sum::<f32>();
        let sum_error = (sum - 1.0).abs();
        assert!(sum_error < 0.01, "ranks should sum to ~1.0, got {sum}");

        // The cycle is symmetric, so no node may stand out from the others.
        let expected = 1.0 / 3.0;
        for (i, r) in ranks.iter().copied().enumerate() {
            let deviation = (r - expected).abs();
            assert!(deviation < 0.05, "rank[{i}] = {r}, expected ~{expected}");
        }
    }

    #[test]
    fn test_pagerank_star() {
        // Inbound star: leaves 0, 1 and 2 each link to node 3, which has no
        // outbound edges of its own.
        let spokes = vec![(0, 3, 1), (1, 3, 1), (2, 3, 1)];
        let ranks = pagerank(4, &spokes, None);
        assert_eq!(ranks.len(), 4);

        // Arg-max over the scores, using a total order on f32.
        let (max_idx, _) = ranks
            .iter()
            .enumerate()
            .max_by(|lhs, rhs| lhs.1.total_cmp(rhs.1))
            .unwrap();
        assert_eq!(max_idx, 3, "node 3 should have highest rank");

        // The hub must be strictly ahead of a leaf, not merely tied with it.
        let hub_beats_leaf = ranks[3] > ranks[0];
        assert!(
            hub_beats_leaf,
            "rank[3]={} should be > rank[0]={}",
            ranks[3],
            ranks[0]
        );
    }

    #[test]
    fn test_pagerank_topic_sensitive() {
        // Directed chain of ten nodes: 0 -> 1 -> ... -> 9.
        //
        // Why n = 10: with PERSONALIZATION_ALPHA = 0.15 the focus node is
        // given 0.15 of the teleportation mass, while its uniform share is
        // 1/n.  For n = 10 that share is 0.10 < 0.15, so focusing on node 0
        // must raise its rank relative to the unfocused run.
        //
        // An earlier version used a 3-node chain.  That stopped working when
        // alpha dropped from 0.70 to 0.15: the uniform share 1/3 = 0.33 then
        // exceeded alpha, so the "focused" node actually received *less*
        // teleportation than in the uniform case and the inequality flipped.
        let n = 10_usize;
        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
        let edges: Vec<(u32, u32, u32)> = (1..n)
            .map(|next| ((next - 1) as u32, next as u32, 1_u32))
            .collect();

        let uniform_ranks = pagerank(n, &edges, None);
        let biased_ranks = pagerank(n, &edges, Some(0));

        // Personalization on node 0 must lift it above its uniform score
        // (alpha 0.15 > 1/n = 0.10 for this graph size).
        let focus_gained = biased_ranks[0] > uniform_ranks[0];
        assert!(
            focus_gained,
            "focused rank[0]={} should be > uniform rank[0]={}",
            biased_ranks[0],
            uniform_ranks[0]
        );
    }

    // ── J1 tests — topic-sensitive PageRank soft personalization ─────────

    /// J1 RED: focusing on a file must not collapse the ranks of all others.
    ///
    /// Before 4.0.5 the focus node received 70% of the teleportation mass,
    /// which produced a near-Dirac distribution (focus ≈ 0.703, every other
    /// node ≈ 0.003).  This test fails against that baseline and must pass
    /// once the fix is in place.
    ///
    /// Invariant under test: with `PERSONALIZATION_ALPHA = 0.15` the focus
    /// node gets 0.15 of the teleportation mass and each of the remaining
    /// (n-1) nodes gets 0.85/(n-1).  On a 10-node star the focus rank must be
    /// at most 40× the mean non-focus rank.  The 4.0.5 fix aims for roughly
    /// 5-10× on a well-connected graph, so 40× is a deliberately loose upper
    /// bound — one the baseline (≈200×) still violates.
    #[test]
    fn test_focus_file_topic_pagerank_preserves_rank_dispersion() {
        // Inbound star: nodes 1..9 each link to node 0, which therefore has
        // a high natural rank.  The focus is a leaf (node 1, low natural
        // rank) so that the effect measured comes from personalization.
        let n = 10_usize;
        let focus = 1_usize;
        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
        let edges: Vec<(u32, u32, u32)> = (1..n)
            .map(|leaf| (leaf as u32, 0_u32, 1_u32))
            .collect();

        let ranks_focused = pagerank(n, &edges, Some(focus));

        // Mean rank over every node except the focus, summed in index order.
        let focus_rank = ranks_focused[focus];
        let sum_non_focus: f32 = ranks_focused
            .iter()
            .enumerate()
            .filter_map(|(i, &r)| (i != focus).then_some(r))
            .sum();
        let avg_non_focus = sum_non_focus / (n - 1) as f32;

        let dispersion_ratio = focus_rank / avg_non_focus;

        eprintln!(
            "J1 dispersion: focus_rank={focus_rank:.6}, avg_non_focus={avg_non_focus:.6}, \
             ratio={dispersion_ratio:.2}× (must be <= 40×)"
        );

        // A 0.15 alpha gives the focus only a modest teleportation edge; the
        // old 0.70 concentration blows straight through this ceiling.
        let within_ceiling = dispersion_ratio <= 40.0;
        assert!(
            within_ceiling,
            "focus rank is {dispersion_ratio:.1}× avg non-focus rank (must be ≤ 40×); \
             pre-fix baseline was ~200× due to 70% concentration — I#16"
        );

        // Personalization must not break normalization of the scores.
        let total: f32 = ranks_focused.iter().sum();
        let total_error = (total - 1.0).abs();
        assert!(total_error < 0.01, "ranks must sum to ≈1.0; got {total}");
    }

    /// J1 RED: non-focus nodes must NOT collapse to a flat floor when a
    /// focus is supplied (neighborhood rebiasing, not winner-take-all).
    ///
    /// Concretely, this test asserts that *every* non-focus node keeps a rank
    /// of at least 10% of the focus node's rank.  It does not check where the
    /// focus node itself lands in the ordering.
    #[test]
    fn test_focus_file_topic_pagerank_does_not_collapse_other_files() {
        // Directed chain 0 → 1 → 2 → ... → 9 with the focus on node 0.
        // Soft personalization must leave the downstream nodes with
        // non-trivial ranks.
        let n = 10_usize;
        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
        let edges: Vec<(u32, u32, u32)> = (1..n)
            .map(|next| ((next - 1) as u32, next as u32, 1_u32))
            .collect();

        let ranks = pagerank(n, &edges, Some(0));
        let focus_rank = ranks[0];

        // Node 0 is the focus itself, so start the scan at node 1.
        for (i, r) in ranks.iter().copied().enumerate().skip(1) {
            let keeps_signal = r >= focus_rank * 0.10;
            assert!(
                keeps_signal,
                "rank[{i}] = {r:.6} is < 10% of focus rank {focus_rank:.6}; \
                 non-focus files must not collapse to near-zero (I#16)"
            );
        }
    }

    // ── J2 tests — neighborhood count parity ─────────────────────────────

    /// J2 RED: a focused `render_json_budgeted` call must return at least 80%
    /// as many files as the unfocused call made with the same budget.
    ///
    /// On the baseline the focused run degenerates into one dominant file
    /// followed by a flat tail.  The tail may still be emitted, but the
    /// intent is that no budget is wasted on zero-signal entries; with soft
    /// personalization the focused file count should stay close to the
    /// unfocused one.
    #[test]
    fn test_focus_file_returns_neighborhood_not_just_focus() {
        // 12-file inbound star (files 1..11 each link to file 0), which
        // gives the ranks meaningful variation.
        let n = 12_usize;
        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
        let edges: Vec<(u32, u32, u32)> = (1..n)
            .map(|leaf| (leaf as u32, 0_u32, 1_u32))
            .collect();
        let base_ranks = pagerank(n, &edges, None);
        let (callers, callees) = build_neighbor_lists(n, &edges);

        // One single-function file per node.
        let mut files: Vec<FileNode> = Vec::with_capacity(n);
        for i in 0..n {
            let only_def = Definition {
                name: format!("func_{i}"),
                kind: "function_item".to_string(),
                start_line: 1,
                end_line: 5,
                scope: String::new(),
                signature: Some(format!("fn func_{i}() -> i32")),
                start_byte: 0,
                end_byte: 100,
                calls: vec![],
            };
            files.push(FileNode {
                path: format!("src/file_{i}.rs"),
                defs: vec![only_def],
                imports: vec![],
            });
        }

        // Definition-level data is left empty in this fixture.
        let graph = RepoGraph {
            files,
            edges,
            base_ranks,
            callers,
            callees,
            def_edges: vec![],
            def_ranks: vec![],
            def_callers: vec![],
            def_callees: vec![],
            def_offsets: vec![0],
            alpha: 0.5,
        };

        // Generous budget; all 12 files should fit.
        let budget = 2000;
        let unfocused = render_json_budgeted(&graph, budget, None, false);
        let focused = render_json_budgeted(&graph, budget, Some(1), false);

        let [unfocused_n, focused_n] = [&unfocused, &focused].map(|resp| resp.files.len());
        #[expect(
            clippy::cast_possible_truncation,
            clippy::cast_sign_loss,
            reason = "unfocused_n is a file count (small, positive); f32 multiplication \
                      by 0.80 and ceil produce a value in [0, n]; truncation to usize is safe"
        )]
        let min_expected = (unfocused_n as f32 * 0.80).ceil() as usize;

        eprintln!(
            "J2 neighborhood: unfocused={unfocused_n} files, focused={focused_n} files \
             (need ≥ {min_expected})"
        );

        let enough_files = focused_n >= min_expected;
        assert!(
            enough_files,
            "focused call returned {focused_n} files; expected ≥ {min_expected} \
             (80% of unfocused {unfocused_n}); soft personalization must preserve \
             rank dispersion across files (I#16/J2)"
        );
    }

    /// J2 RED: topic delta fingerprinting — a focus must reorder the ranks
    /// relative to the unfocused run so that the focus node surfaces on top,
    /// without flattening every other node.
    ///
    /// Checked directly on `pagerank` output: (1) the focus node has the
    /// highest focused rank, (2) the unfocused ranks are approximately equal,
    /// and (3) each non-focus node keeps at least 5% of the focus rank.
    #[test]
    fn test_focus_delta_topic_fingerprinting_works() {
        // Bidirectional ring of eight nodes: every node is structurally
        // equivalent, so any asymmetry in the focused run is caused by the
        // personalization on node 3.
        let n = 8_usize;
        #[expect(clippy::cast_possible_truncation, reason = "test: n << u32::MAX")]
        let edges: Vec<(u32, u32, u32)> = (0..n)
            .flat_map(|i| {
                let (curr, next) = (i as u32, ((i + 1) % n) as u32);
                [(curr, next, 1_u32), (next, curr, 1_u32)]
            })
            .collect();

        let ranks_uniform = pagerank(n, &edges, None);
        let ranks_focused = pagerank(n, &edges, Some(3));

        // (1) The focus node wins the focused ranking.
        let (top_idx, _) = ranks_focused
            .iter()
            .enumerate()
            .max_by(|lhs, rhs| lhs.1.total_cmp(rhs.1))
            .unwrap();
        assert_eq!(
            top_idx, 3,
            "with focus=Some(3), node 3 must have highest rank; top was {top_idx}"
        );

        // (2) Without a focus the ring is flat: max and min nearly coincide.
        let (uniform_max, uniform_min) = ranks_uniform.iter().copied().fold(
            (f32::NEG_INFINITY, f32::INFINITY),
            |(hi, lo), r| (hi.max(r), lo.min(r)),
        );
        let spread = (uniform_max - uniform_min).abs();
        assert!(
            spread < 0.01,
            "on a ring without focus all ranks should be ≈equal; max={uniform_max:.6} min={uniform_min:.6}"
        );

        // (3) The other nodes stay non-trivial: at least 5% of the focus rank.
        let focus_rank = ranks_focused[3];
        for (i, r) in ranks_focused.iter().copied().enumerate() {
            if i == 3 {
                continue;
            }
            assert!(
                r >= focus_rank * 0.05,
                "rank[{i}]={r:.6} is < 5% of focus rank {focus_rank:.6}; \
                 soft personalization must preserve non-focus ranks"
            );
        }
    }

    // ── T1 tests — focus_file resolver normalization (I#20) ──────────────

    /// Fixture for the focus-resolver tests: a four-file graph with no
    /// definitions, imports or edges, and a uniform base rank.
    ///
    /// The `FileNode::path` values follow the shape `build_graph` produces on
    /// disk: no leading `./`, forward slashes only.
    fn focus_resolver_graph() -> RepoGraph {
        let files: Vec<FileNode> = [
            "device_opt/services/storage.py",
            "device_opt/ui/textual/screens/settings.py",
            "device_opt/services/registry.py",
            "tests/test_storage.py",
        ]
        .into_iter()
        .map(|path| FileNode {
            path: path.to_string(),
            defs: vec![],
            imports: vec![],
        })
        .collect();

        let n = files.len();
        RepoGraph {
            files,
            edges: vec![],
            base_ranks: vec![1.0 / n as f32; n],
            callers: vec![vec![]; n],
            callees: vec![vec![]; n],
            def_edges: vec![],
            def_ranks: vec![],
            def_callers: vec![],
            def_callees: vec![],
            // No file has definitions, so every offset is zero.
            def_offsets: vec![0; n + 1],
            alpha: 0.5,
        }
    }

    /// T1: a focus path in the form emitted by `lsp_location.file_path`
    /// (carrying the `./` prefix) must resolve to the correct file index.
    ///
    /// Baseline reproduction (mnemosyne corpus, 4.0.5): a call with
    /// `focus_file="./device_opt/.../settings.py"` returned rank values
    /// bit-identical to the unfocused call.  The strict-suffix matcher in
    /// `tools.rs` failed both its `exact` and its `strict_suffix` checks on
    /// the `./`-prefixed input and silently fell back to `focus = None`,
    /// which made the bug look like "topic-sensitive PageRank does nothing on
    /// Python".
    #[test]
    fn test_focus_file_resolver_accepts_lsp_location_path() {
        let g = focus_resolver_graph();
        // The `./`-prefixed LSP shape is the form documented in
        // get_repo_map's instructions.
        let res = g.resolve_focus_file("./device_opt/ui/textual/screens/settings.py");
        let idx = match res {
            FocusResolution::Found(idx) => idx,
            FocusResolution::NotFound | FocusResolution::Ambiguous(_) => {
                panic!(
                    "resolver returned {res:?} for ./device_opt/ui/textual/screens/settings.py; \
                     the LSP-shaped path form must resolve to exactly one file (I#20)"
                );
            }
        };
        // The hit must be the stored (unprefixed) settings.py entry.
        assert_eq!(
            g.files[idx].path, "device_opt/ui/textual/screens/settings.py",
            "resolver must accept the ./-prefixed LSP path form (I#20)"
        );
    }

    /// T1: a focus given exactly as the path is stored (no `./`) must keep
    /// resolving.  Regression guard for the "exact" branch of the pre-fix
    /// matcher.
    #[test]
    fn test_focus_file_resolver_accepts_bare_stored_path() {
        let g = focus_resolver_graph();
        let idx = match g.resolve_focus_file("device_opt/services/storage.py") {
            FocusResolution::Found(idx) => idx,
            other => panic!("expected Found, got {other:?}"),
        };
        assert_eq!(g.files[idx].path, "device_opt/services/storage.py");
    }

    /// T1: strict-suffix matching.  `storage.py` must match
    /// `device_opt/services/storage.py` (the preceding character is `/`), and
    /// when two files share that suffix the resolver must report the
    /// ambiguity instead of silently picking one.
    #[test]
    fn test_focus_file_resolver_strict_suffix_and_ambiguity() {
        let g = focus_resolver_graph();
        // `tests/test_storage.py` also ends in "storage.py", but the
        // character before the suffix is `_`, not `/`, so the strict-suffix
        // rule rejects it and exactly one candidate remains.
        let res = g.resolve_focus_file("storage.py");
        let is_unique = matches!(res, FocusResolution::Found(_));
        assert!(
            is_unique,
            "strict-suffix `storage.py` must match exactly one file (the `_` in \
             test_storage.py blocks the strict-suffix), got {res:?}"
        );

        // Extend a copy of the graph with a second `.../storage.py` so the
        // same query now has two strict-suffix matches.  The per-file vectors
        // each gain one entry alongside the new file.
        let mut g2 = g.clone();
        let last_offset = *g2.def_offsets.last().unwrap();
        g2.def_offsets.push(last_offset);
        g2.callees.push(vec![]);
        g2.callers.push(vec![]);
        g2.base_ranks.push(0.0);
        g2.files.push(FileNode {
            path: "vendored/services/storage.py".to_string(),
            defs: vec![],
            imports: vec![],
        });

        let cands = match g2.resolve_focus_file("storage.py") {
            FocusResolution::Ambiguous(cands) => cands,
            other => panic!("expected Ambiguous, got {other:?}"),
        };
        assert_eq!(cands.len(), 2, "expected two candidates, got {cands:?}");
    }

    /// T1: a focus matching no file yields `NotFound`.  Whether that is then
    /// treated as "unfocused" or surfaced as an error is the caller's
    /// decision — the resolver itself imposes no policy.
    #[test]
    fn test_focus_file_resolver_not_found() {
        let graph = focus_resolver_graph();
        let res = graph.resolve_focus_file("./does/not/exist.py");
        let is_not_found = matches!(res, FocusResolution::NotFound);
        assert!(is_not_found, "expected NotFound, got {res:?}");
    }

    /// T1: an empty focus string matches nothing.  This guards against the
    /// degenerate case in which an empty suffix would match every file.
    #[test]
    fn test_focus_file_resolver_empty_input_is_not_found() {
        let graph = focus_resolver_graph();
        let res = graph.resolve_focus_file("");
        let is_not_found = matches!(res, FocusResolution::NotFound);
        assert!(
            is_not_found,
            "empty focus must not match anything, got {res:?}"
        );
    }

    /// T1: focus_file rank delta must be visible on a Python-shaped
    /// synthetic graph.
    ///
    /// Builds a small Python-style call graph (FileNode + Definition with
    /// CallRefs that are resolved via `resolve_calls`, matching what
    /// `extract_calls` produces on a real Python corpus), runs
    /// `build_graph_from_files_pub` to get a `RepoGraph`, then calls
    /// `render_json_budgeted` with and without focus. Asserts that the
    /// focused call changes the rank of at least one non-focus file by ≥ 5%
    /// in either direction, and that at least one non-focus rank is not
    /// bit-identical between the two calls.
    ///
    /// On the baseline (pre-T1) this test passes when the caller supplies
    /// an int focus_idx directly — the engine's topic-sensitive PageRank is
    /// correct. The bug was at the string-to-int resolver layer in
    /// `tools.rs`, which silently masked the failure as "the rendering
    /// path doesn't propagate focus". This test locks the engine's
    /// behavior so a future regression in the rendering path is caught.
    #[test]
    #[expect(
        clippy::too_many_lines,
        reason = "synthetic Python-shaped graph (five FileNodes with defs + \
                  CallRefs + ImportRefs) plus the two-call assertion sequence \
                  is inherently long; splitting into helpers would obscure the \
                  one-shot reproduction the test is locking in."
    )]
    fn test_focus_file_rank_delta_visible_on_python_corpus() {
        // Five files, Python-shaped. services/storage.py is the "hub":
        // services/registry.py, ui/screens/browse.py and
        // tests/test_storage.py each carry a CallRef to `save_scan`, while
        // ui/screens/settings.py (the focus file below) carries a CallRef to
        // `register` in registry.py. The Python tree-sitter extractor
        // produces `class_definition` and `function_definition` kinds; the
        // CallRefs start unresolved (`resolved: None`) and are resolved
        // further down.
        let mut files: Vec<FileNode> = vec![
            // File 0 — the hub: defines `ScanStore` and `save_scan` (scoped
            // under `ScanStore`); makes no calls itself.
            FileNode {
                path: "device_opt/services/storage.py".to_string(),
                defs: vec![
                    Definition {
                        name: "ScanStore".to_string(),
                        kind: "class_definition".to_string(),
                        start_line: 1,
                        end_line: 80,
                        scope: String::new(),
                        signature: None,
                        start_byte: 0,
                        end_byte: 2000,
                        calls: vec![],
                    },
                    Definition {
                        name: "save_scan".to_string(),
                        kind: "function_definition".to_string(),
                        start_line: 20,
                        end_line: 40,
                        scope: "class_definition ScanStore".to_string(),
                        signature: Some("def save_scan(self, scan)".to_string()),
                        start_byte: 200,
                        end_byte: 600,
                        calls: vec![],
                    },
                ],
                imports: vec![],
            },
            // File 1 — `register` references `save_scan`; imports file 0.
            FileNode {
                path: "device_opt/services/registry.py".to_string(),
                defs: vec![Definition {
                    name: "register".to_string(),
                    kind: "function_definition".to_string(),
                    start_line: 1,
                    end_line: 30,
                    scope: String::new(),
                    signature: Some("def register(svc)".to_string()),
                    start_byte: 0,
                    end_byte: 600,
                    calls: vec![CallRef {
                        name: "save_scan".to_string(),
                        qualified_path: None,
                        receiver_type: None,
                        byte_offset: 100,
                        resolved: None,
                    }],
                }],
                imports: vec![ImportRef {
                    raw_path: "from device_opt.services import storage".to_string(),
                    resolved_idx: Some(0),
                }],
            },
            // File 2 — `browse_scans` references `save_scan`; imports file 0.
            FileNode {
                path: "device_opt/ui/screens/browse.py".to_string(),
                defs: vec![Definition {
                    name: "browse_scans".to_string(),
                    kind: "function_definition".to_string(),
                    start_line: 1,
                    end_line: 50,
                    scope: String::new(),
                    signature: Some("def browse_scans(app)".to_string()),
                    start_byte: 0,
                    end_byte: 1000,
                    calls: vec![CallRef {
                        name: "save_scan".to_string(),
                        qualified_path: None,
                        receiver_type: None,
                        byte_offset: 200,
                        resolved: None,
                    }],
                }],
                imports: vec![ImportRef {
                    raw_path: "from device_opt.services import storage".to_string(),
                    resolved_idx: Some(0),
                }],
            },
            // File 3 — the focus file: `open_settings` references `register`;
            // imports file 1.
            FileNode {
                path: "device_opt/ui/screens/settings.py".to_string(),
                defs: vec![Definition {
                    name: "open_settings".to_string(),
                    kind: "function_definition".to_string(),
                    start_line: 1,
                    end_line: 40,
                    scope: String::new(),
                    signature: Some("def open_settings(app)".to_string()),
                    start_byte: 0,
                    end_byte: 800,
                    calls: vec![CallRef {
                        name: "register".to_string(),
                        qualified_path: None,
                        receiver_type: None,
                        byte_offset: 150,
                        resolved: None,
                    }],
                }],
                imports: vec![ImportRef {
                    raw_path: "from device_opt.services import registry".to_string(),
                    resolved_idx: Some(1),
                }],
            },
            // File 4 — `test_save` references `save_scan`; imports file 0.
            FileNode {
                path: "tests/test_storage.py".to_string(),
                defs: vec![Definition {
                    name: "test_save".to_string(),
                    kind: "function_definition".to_string(),
                    start_line: 1,
                    end_line: 20,
                    scope: String::new(),
                    signature: Some("def test_save()".to_string()),
                    start_byte: 0,
                    end_byte: 400,
                    calls: vec![CallRef {
                        name: "save_scan".to_string(),
                        qualified_path: None,
                        receiver_type: None,
                        byte_offset: 50,
                        resolved: None,
                    }],
                }],
                imports: vec![ImportRef {
                    raw_path: "from device_opt.services import storage".to_string(),
                    resolved_idx: Some(0),
                }],
            },
        ];

        // Resolve calls so the graph builder has edges to chew on.
        // NOTE(review): presumably `resolve_calls` fills in each
        // `CallRef::resolved` from `def_index`; the empty map is its third
        // argument — confirm its meaning against the function's definition.
        let def_index = build_def_index(&files);
        resolve_calls(&mut files, &def_index, &HashMap::new());
        let graph = build_graph_from_files_pub(files);

        // Sanity: the graph must have edges (the calls were resolved).
        // Without edges the focused/unfocused comparison below would not
        // exercise the call graph at all.
        assert!(
            !graph.edges.is_empty(),
            "Python-shaped synthetic graph must produce file-level edges; got 0. \
             The CallRefs may have failed to resolve."
        );

        // Resolve the focus file via the new helper, using the `./`-prefixed
        // LSP-shaped path rather than the stored path.
        let focus_idx = match graph.resolve_focus_file("./device_opt/ui/screens/settings.py") {
            FocusResolution::Found(i) => i,
            other => panic!("resolver must find settings.py via LSP-shaped path, got {other:?}"),
        };

        // Same budget for both renders so only the focus differs.
        let budget = 4000;
        let unfocused = render_json_budgeted(&graph, budget, None, false);
        let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);

        // Collect rank-by-path maps for both runs, keyed by the
        // `lsp_location.file_path` of each returned file.
        let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
            .files
            .iter()
            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
            .collect();
        let focused_ranks: std::collections::HashMap<String, f32> = focused
            .files
            .iter()
            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
            .collect();

        eprintln!("T1 Python — unfocused ranks: {unfocused_ranks:#?}");
        eprintln!("T1 Python — focused ranks:   {focused_ranks:#?}");

        // Find at least one non-focus file whose rank changed by ≥ 5% in
        // either direction. The threshold is conservative; the soft 0.15
        // personalization alpha redistributes mass enough that on this
        // 5-node graph the affected neighbors typically shift by 20%+.
        //
        // Files missing from the focused response, or with a non-positive
        // unfocused rank (which would make the ratio undefined), are skipped.
        let focus_path = "./device_opt/ui/screens/settings.py";
        let mut max_delta_ratio = 0.0_f32;
        for (path, &u_rank) in &unfocused_ranks {
            if path == focus_path {
                continue;
            }
            if let Some(&f_rank) = focused_ranks.get(path)
                && u_rank > 0.0
            {
                // Relative change of this file's rank between the two runs.
                let ratio = (f_rank - u_rank).abs() / u_rank;
                if ratio > max_delta_ratio {
                    max_delta_ratio = ratio;
                }
            }
        }
        assert!(
            max_delta_ratio >= 0.05,
            "focus_file must rebias non-focus file ranks by ≥ 5%; \
             max observed delta ratio = {max_delta_ratio:.3} \
             (I#20: focus_file invisible on Python corpora)"
        );

        // Bit-identity guard: at least one non-focus file's rank must NOT
        // equal its unfocused value. This is the pathology from the
        // mnemosyne reproduction: every rank value was bit-identical
        // across global/focused calls. Comparing `to_bits()` makes the
        // check exact rather than tolerance-based.
        let any_changed = unfocused_ranks.iter().any(|(path, &u_rank)| {
            path != focus_path
                && focused_ranks
                    .get(path)
                    .is_some_and(|&f_rank| f_rank.to_bits() != u_rank.to_bits())
        });
        assert!(
            any_changed,
            "no non-focus file rank changed across focused/unfocused calls — \
             bit-identical pathology (I#20). unfocused={unfocused_ranks:#?} \
             focused={focused_ranks:#?}"
        );
    }

    /// T1: focus_file rank delta on a Rust-shaped synthetic graph.
    ///
    /// Regression test: confirms the engine's topic-sensitive PageRank
    /// works on Rust shapes (where T1's investigation found it already
    /// works, but the resolver fix must not break the existing path).
    ///
    /// This complements `test_focus_file_returns_neighborhood_not_just_focus`
    /// by additionally checking that (a) the resolver accepts a Rust path
    /// with the `./` LSP prefix, and (b) at least one non-focus file's
    /// rank moves by ≥ 5%.
    #[test]
    #[expect(
        clippy::too_many_lines,
        reason = "synthetic Rust-shaped graph with four FileNodes plus the \
                  two-call assertion sequence inherently exceeds the 100-line \
                  cap; the test mirrors the Python-shaped sibling."
    )]
    fn test_focus_file_rank_delta_preserved_on_rust_corpus() {
        let mut files: Vec<FileNode> = vec![
            FileNode {
                path: "src/lib.rs".to_string(),
                defs: vec![Definition {
                    name: "Engine".to_string(),
                    kind: "struct_item".to_string(),
                    start_line: 1,
                    end_line: 30,
                    scope: String::new(),
                    signature: None,
                    start_byte: 0,
                    end_byte: 600,
                    calls: vec![],
                }],
                imports: vec![],
            },
            FileNode {
                path: "src/encoder/mod.rs".to_string(),
                defs: vec![Definition {
                    name: "encode".to_string(),
                    kind: "function_item".to_string(),
                    start_line: 1,
                    end_line: 40,
                    scope: String::new(),
                    signature: Some("fn encode(input: &str) -> Vec<f32>".to_string()),
                    start_byte: 0,
                    end_byte: 800,
                    calls: vec![],
                }],
                imports: vec![ImportRef {
                    raw_path: "use crate::lib;".to_string(),
                    resolved_idx: Some(0),
                }],
            },
            FileNode {
                path: "src/search.rs".to_string(),
                defs: vec![Definition {
                    name: "search".to_string(),
                    kind: "function_item".to_string(),
                    start_line: 1,
                    end_line: 30,
                    scope: String::new(),
                    signature: Some("fn search(q: &str) -> Hits".to_string()),
                    start_byte: 0,
                    end_byte: 600,
                    calls: vec![CallRef {
                        name: "encode".to_string(),
                        qualified_path: None,
                        receiver_type: None,
                        byte_offset: 100,
                        resolved: None,
                    }],
                }],
                imports: vec![ImportRef {
                    raw_path: "use crate::encoder;".to_string(),
                    resolved_idx: Some(1),
                }],
            },
            FileNode {
                path: "src/cli.rs".to_string(),
                defs: vec![Definition {
                    name: "main".to_string(),
                    kind: "function_item".to_string(),
                    start_line: 1,
                    end_line: 20,
                    scope: String::new(),
                    signature: Some("fn main()".to_string()),
                    start_byte: 0,
                    end_byte: 400,
                    calls: vec![CallRef {
                        name: "search".to_string(),
                        qualified_path: None,
                        receiver_type: None,
                        byte_offset: 50,
                        resolved: None,
                    }],
                }],
                imports: vec![ImportRef {
                    raw_path: "use crate::search;".to_string(),
                    resolved_idx: Some(2),
                }],
            },
        ];

        let def_index = build_def_index(&files);
        resolve_calls(&mut files, &def_index, &HashMap::new());
        let graph = build_graph_from_files_pub(files);

        assert!(
            !graph.edges.is_empty(),
            "Rust-shaped synthetic graph must produce edges"
        );

        let focus_idx = match graph.resolve_focus_file("./src/encoder/mod.rs") {
            FocusResolution::Found(i) => i,
            other => panic!("resolver must find encoder/mod.rs via LSP path, got {other:?}"),
        };

        let budget = 4000;
        let unfocused = render_json_budgeted(&graph, budget, None, false);
        let focused = render_json_budgeted(&graph, budget, Some(focus_idx), false);

        let unfocused_ranks: std::collections::HashMap<String, f32> = unfocused
            .files
            .iter()
            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
            .collect();
        let focused_ranks: std::collections::HashMap<String, f32> = focused
            .files
            .iter()
            .map(|f| (f.lsp_location.file_path.clone(), f.rank))
            .collect();

        eprintln!("T1 Rust — unfocused: {unfocused_ranks:#?}");
        eprintln!("T1 Rust — focused:   {focused_ranks:#?}");

        let focus_path = "./src/encoder/mod.rs";
        let mut max_delta_ratio = 0.0_f32;
        for (path, &u_rank) in &unfocused_ranks {
            if path == focus_path {
                continue;
            }
            if let Some(&f_rank) = focused_ranks.get(path)
                && u_rank > 0.0
            {
                let ratio = (f_rank - u_rank).abs() / u_rank;
                if ratio > max_delta_ratio {
                    max_delta_ratio = ratio;
                }
            }
        }
        assert!(
            max_delta_ratio >= 0.05,
            "focus_file must rebias non-focus file ranks by ≥ 5% on Rust shapes; \
             max observed delta = {max_delta_ratio:.3}"
        );
    }

    /// `pagerank` over zero nodes and no edges must yield an empty rank vector.
    #[test]
    fn test_pagerank_empty() {
        assert!(pagerank(0, &[], None).is_empty());
    }

    /// Renders a ten-file star graph under a generous and a tiny budget.
    ///
    /// The generous render must mention `file_0` under a `## src/file_0.rs`
    /// heading; the tiny render must be non-empty yet have strictly fewer lines.
    #[test]
    fn test_render_tiers() {
        // One single-function file per index.
        let make_file = |i: usize| FileNode {
            path: format!("src/file_{i}.rs"),
            defs: vec![Definition {
                name: format!("func_{i}"),
                kind: "function_item".to_string(),
                start_line: 1,
                end_line: 5,
                scope: String::new(),
                signature: Some(format!("func_{i}(x: i32) -> i32")),
                start_byte: 0,
                end_byte: 0,
                calls: vec![],
            }],
            imports: vec![],
        };
        let files: Vec<FileNode> = (0..10).map(make_file).collect();

        // Star topology: each of files 1..=9 has one weight-1 edge into file 0.
        let edges: Vec<(u32, u32, u32)> = (1..10).map(|src| (src, 0, 1)).collect();
        let (callers, callees) = build_neighbor_lists(10, &edges);
        let base_ranks = pagerank(10, &edges, None);

        let graph = RepoGraph {
            files,
            edges,
            base_ranks,
            callers,
            callees,
            def_edges: Vec::new(),
            def_ranks: Vec::new(),
            def_callers: Vec::new(),
            def_callees: Vec::new(),
            def_offsets: vec![0],
            alpha: 0.5,
        };

        // Generous budget: the hub file must show up with its own heading.
        let full = render(&graph, 10_000, None);
        let mentions_hub = full.contains("file_0");
        assert!(mentions_hub, "output should contain the top-ranked file");
        let has_hub_heading = full.contains("## src/file_0.rs");
        assert!(has_hub_heading, "top file should have tier 0 heading");

        // Tiny budget: still produces output, but less of it.
        let small = render(&graph, 10, None);
        let small_has_output = !small.is_empty();
        assert!(
            small_has_output,
            "even tiny budget should produce some output"
        );

        let (full_lines, small_lines) = (full.lines().count(), small.lines().count());
        assert!(
            small_lines < full_lines,
            "small budget ({small_lines} lines) should have fewer lines than full ({full_lines})"
        );
    }

    /// A graph with no files, edges, or ranks must render to an empty string.
    #[test]
    fn test_render_empty_graph() {
        let empty = RepoGraph {
            files: Vec::new(),
            edges: Vec::new(),
            base_ranks: Vec::new(),
            callers: Vec::new(),
            callees: Vec::new(),
            def_edges: Vec::new(),
            def_ranks: Vec::new(),
            def_callers: Vec::new(),
            def_callees: Vec::new(),
            def_offsets: vec![0],
            alpha: 0.5,
        };
        let rendered_is_empty = render(&empty, 1000, None).is_empty();
        assert!(rendered_is_empty, "empty graph should render empty string");
    }

    /// Runs `build_graph` on the `tests/fixtures` directory located two levels
    /// above this crate's manifest directory.
    ///
    /// Checks that the Rust and Python sample files are discovered with their
    /// expected definitions (`hello`, `greet`) and that the file-level
    /// `PageRank` scores have one entry per file and sum to approximately 1.
    #[test]
    fn test_build_graph_on_fixtures() {
        let fixtures = Path::new(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .and_then(Path::parent)
            .expect("CARGO_MANIFEST_DIR must have a grandparent directory")
            .join("tests")
            .join("fixtures");

        let graph = build_graph(&fixtures).expect("build_graph should succeed on fixtures");

        // The fixture directory must contribute at least one file.
        assert!(
            !graph.files.is_empty(),
            "graph should contain files from fixtures"
        );

        // Should find definitions in the Rust fixture
        let rs_file = graph
            .files
            .iter()
            .find(|f| f.path.ends_with("sample.rs"))
            .expect("should find sample.rs");
        assert!(
            !rs_file.defs.is_empty(),
            "sample.rs should have definitions"
        );
        assert!(
            rs_file.defs.iter().any(|d| d.name == "hello"),
            "should find 'hello' function in sample.rs"
        );

        // Should find definitions in the Python fixture
        let py_file = graph
            .files
            .iter()
            .find(|f| f.path.ends_with("sample.py"))
            .expect("should find sample.py");
        assert!(
            !py_file.defs.is_empty(),
            "sample.py should have definitions"
        );
        assert!(
            py_file.defs.iter().any(|d| d.name == "greet"),
            "should find 'greet' function in sample.py"
        );

        // PageRank scores should be computed: one per file, normalised to ~1.
        assert_eq!(graph.base_ranks.len(), graph.files.len());
        let sum: f32 = graph.base_ranks.iter().sum();
        assert!(
            (sum - 1.0).abs() < 0.01,
            "PageRank scores should sum to ~1.0, got {sum}"
        );
    }

    /// Two Rust `use` statements must yield two extracted imports, the first of
    /// which contains the `crate::foo::bar` path.
    #[test]
    fn test_extract_imports_rust() {
        let (lang, query) = import_query_for_extension("rs").unwrap();
        let source = "use crate::foo::bar;\nuse std::collections::HashMap;\n";

        let imports = extract_imports(source, &lang, &query);

        assert_eq!(imports.len(), 2);
        let first = &imports[0];
        assert!(first.contains("crate::foo::bar"));
    }

    /// The `pyi` extension must have an import query, and both Python import
    /// forms (`from … import …` and `import …`) must be extracted in order.
    #[test]
    fn test_extract_imports_python_stub() {
        let (lang, query) = import_query_for_extension("pyi").unwrap();
        let source = "from typing import Protocol\nimport pkg.types\n";

        let imports = extract_imports(source, &lang, &query);

        assert_eq!(imports.len(), 2);
        let expected = ["from typing import Protocol", "import pkg.types"];
        for (import, needle) in imports.iter().zip(expected) {
            assert!(import.contains(needle));
        }
    }

    /// `import pkg.types` must resolve to the index registered for
    /// `/project/pkg/types.pyi` when that stub is the only indexed file.
    #[test]
    fn test_resolve_python_import_to_stub_file() {
        let file_index = HashMap::from([(PathBuf::from("/project/pkg/types.pyi"), 1)]);
        let root = PathBuf::from("/project");

        let resolved = resolve_python_import("import pkg.types", &root, &file_index);

        assert_eq!(resolved, Some(1));
    }

    /// `use crate::foo::bar;` issued from `src/main.rs` must resolve to the
    /// index registered for `/project/src/foo/bar.rs`.
    #[test]
    fn test_resolve_rust_crate_import() {
        let file_index = HashMap::from([
            (PathBuf::from("/project/src/foo/bar.rs"), 1),
            (PathBuf::from("/project/src/main.rs"), 0),
        ]);
        let root = PathBuf::from("/project");
        let importing_file = PathBuf::from("/project/src/main.rs");

        let resolved =
            resolve_rust_import("use crate::foo::bar;", &importing_file, &root, &file_index);

        assert_eq!(resolved, Some(1));
    }

    /// A `std::…` import must resolve to `None` when the file index is empty.
    #[test]
    fn test_resolve_rust_external_crate_dropped() {
        let empty_index = HashMap::new();
        let importing_file = PathBuf::from("/project/src/main.rs");
        let root = PathBuf::from("/project");

        let resolved = resolve_rust_import(
            "use std::collections::HashMap;",
            &importing_file,
            &root,
            &empty_index,
        );

        assert_eq!(resolved, None, "external crate imports should be dropped");
    }

    /// For the edges 0 -> 1, 0 -> 2, 1 -> 2, the first list returned by
    /// `build_neighbor_lists` must record 0 and 1 under node 2 (incoming) and
    /// the second must record 1 and 2 under node 0 (outgoing).
    #[test]
    fn test_neighbor_lists() {
        let edges = vec![(0, 1, 1), (0, 2, 1), (1, 2, 1)];
        let (incoming, outgoing) = build_neighbor_lists(3, &edges);

        // Node 2 is the target of edges from 0 and 1.
        let into_two = &incoming[2];
        for source in [0, 1] {
            assert!(into_two.contains(&source));
        }

        // Node 0 is the source of edges to 1 and 2.
        let out_of_zero = &outgoing[0];
        for target in [1, 2] {
            assert!(out_of_zero.contains(&target));
        }
    }

    /// G1 (R2.3 issue a): for a scoped call such as `mod_a::foo()` the extracted
    /// `CallRef` must carry:
    /// - `name = "foo"` — the bare trailing identifier;
    /// - `qualified_path = Some("mod_a::foo")` — the full scoped path.
    ///
    /// Prior to G1 the full `"mod_a::foo"` path was stored in `name`. This test
    /// parses a snippet with two same-named functions in different modules,
    /// runs `extract_calls`, and inspects the calls recorded on `caller`.
    #[test]
    fn test_scoped_identifier_calls_preserve_path() {
        use crate::languages;
        use streaming_iterator::StreamingIterator as _;

        let source = "
mod mod_a {
    pub fn foo() {}
}
mod mod_b {
    pub fn foo() {}
}
fn caller() {
    mod_a::foo();
    mod_b::foo();
}
";
        let call_config =
            languages::call_query_for_extension("rs").expect("Rust call config must exist");
        let lang_config =
            languages::config_for_extension("rs").expect("Rust lang config must exist");

        // Parse the snippet and turn every match that has a `def` capture into
        // a `Definition`; the `name` capture (if any) supplies its name.
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&lang_config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        let mut cursor = tree_sitter::QueryCursor::new();
        let mut defs = Vec::new();
        let mut matches = cursor.matches(&lang_config.query, tree.root_node(), source.as_bytes());
        while let Some(m) = matches.next() {
            let mut def_name = None;
            let mut def_node = None;
            for cap in m.captures {
                let captured = cap.node;
                let cname = &lang_config.query.capture_names()[cap.index as usize];
                if *cname == "name" {
                    def_name = Some(source[captured.start_byte()..captured.end_byte()].to_string());
                } else if *cname == "def" {
                    def_node = Some(captured);
                }
            }
            let Some(node) = def_node else {
                continue;
            };
            #[expect(clippy::cast_possible_truncation)]
            defs.push(Definition {
                name: def_name.unwrap_or_default(),
                kind: node.kind().to_string(),
                start_line: node.start_position().row as u32 + 1,
                end_line: node.end_position().row as u32 + 1,
                scope: String::new(),
                signature: None,
                start_byte: node.start_byte() as u32,
                end_byte: node.end_byte() as u32,
                calls: vec![],
            });
        }

        extract_calls(source, &call_config, &mut defs);

        // Inspect the calls attached to the `caller` function.
        let recorded_calls = &defs
            .iter()
            .find(|d| d.name == "caller")
            .expect("caller def")
            .calls;

        // G1: bare name is "foo", qualified_path carries the module path.
        let mut call_names: Vec<&str> = Vec::with_capacity(recorded_calls.len());
        let mut qualified_paths: Vec<Option<&str>> = Vec::with_capacity(recorded_calls.len());
        for call in recorded_calls {
            call_names.push(call.name.as_str());
            qualified_paths.push(call.qualified_path.as_deref());
        }

        // Bare names must be the trailing identifier only.
        let has_bare_foo = call_names.contains(&"foo");
        assert!(
            has_bare_foo,
            "bare name 'foo' must appear for scoped calls; got: {call_names:?}"
        );
        // Qualified paths must carry the full scope.
        let has_mod_a_path = qualified_paths.contains(&Some("mod_a::foo"));
        assert!(
            has_mod_a_path,
            "qualified_path 'mod_a::foo' must appear; got: {qualified_paths:?}"
        );
        let has_mod_b_path = qualified_paths.contains(&Some("mod_b::foo"));
        assert!(
            has_mod_b_path,
            "qualified_path 'mod_b::foo' must appear; got: {qualified_paths:?}"
        );
        // Full paths must NOT appear in bare names.
        let leaks_full_path = call_names.contains(&"mod_a::foo");
        assert!(
            !leaks_full_path,
            "full path 'mod_a::foo' must not appear in bare name; got: {call_names:?}"
        );
    }

    /// RED test (R2.3 issue b+c): two files each define `Read`, and a third file
    /// makes an unqualified call to `Read` with no imports. `resolve_calls` must
    /// not silently pick the first candidate; this test requires the call to be
    /// left unresolved (`None`).
    #[test]
    fn test_ambiguous_name_resolution_returns_all_or_none() {
        // A file whose only definition is a trait named `Read`.
        let read_trait_file = |path: &str| FileNode {
            path: path.to_string(),
            defs: vec![Definition {
                name: "Read".to_string(),
                kind: "trait_item".to_string(),
                start_line: 1,
                end_line: 3,
                scope: String::new(),
                signature: None,
                start_byte: 0,
                end_byte: 50,
                calls: vec![],
            }],
            imports: vec![],
        };

        // The caller: one function containing a single unqualified `Read` call.
        let ambiguous_call = CallRef {
            name: "Read".to_string(),
            qualified_path: None,
            receiver_type: None,
            byte_offset: 10,
            resolved: None,
        };
        let caller_file = FileNode {
            path: "caller.rs".to_string(),
            defs: vec![Definition {
                name: "do_thing".to_string(),
                kind: "function_item".to_string(),
                start_line: 1,
                end_line: 5,
                scope: String::new(),
                signature: None,
                start_byte: 0,
                end_byte: 100,
                calls: vec![ambiguous_call],
            }],
            imports: vec![],
        };

        let mut files = vec![
            read_trait_file("mod_a.rs"),
            read_trait_file("mod_b.rs"),
            caller_file,
        ];
        let def_index = build_def_index(&files);
        resolve_calls(&mut files, &def_index, &HashMap::new());

        // Two candidates, neither in the caller's file nor imported: the call
        // must stay unresolved rather than resolve to whichever came first.
        assert_eq!(
            files[2].defs[0].calls[0].resolved, None,
            "ambiguous unqualified call with no import context must resolve to None, not silently pick first"
        );
    }

    // ── D1 / D2 tests ────────────────────────────────────────────────

    /// Builds a synthetic star graph for the D1/D2 render tests.
    ///
    /// Creates `n_code` Rust files (`src/file_{i}.rs`), each holding one function
    /// and one struct definition, and — when `include_json` is set — appends a
    /// definition-less `data/config.json` node. Files `1..n_code` each get one
    /// weight-1 edge into file 0; the JSON node gets no edges.
    ///
    /// Returns the graph together with the index of the JSON node (an empty
    /// vector when `include_json` is false).
    fn build_test_graph(n_code: usize, include_json: bool) -> (RepoGraph, Vec<usize>) {
        let code_file = |i: usize| {
            let function = Definition {
                name: format!("func_{i}"),
                kind: "function_item".to_string(),
                start_line: 1,
                end_line: 5,
                scope: String::new(),
                signature: Some(format!("fn func_{i}() -> i32")),
                start_byte: 0,
                end_byte: 100,
                calls: vec![],
            };
            let structure = Definition {
                name: format!("MyStruct{i}"),
                kind: "struct_item".to_string(),
                start_line: 7,
                end_line: 10,
                scope: String::new(),
                signature: None,
                start_byte: 110,
                end_byte: 200,
                calls: vec![],
            };
            FileNode {
                path: format!("src/file_{i}.rs"),
                defs: vec![function, structure],
                imports: vec![],
            }
        };
        let mut file_nodes: Vec<FileNode> = (0..n_code).map(code_file).collect();

        // Optionally append the metadata file and remember where it landed.
        let mut json_idx = Vec::new();
        if include_json {
            json_idx.push(file_nodes.len());
            file_nodes.push(FileNode {
                path: "data/config.json".to_string(),
                defs: vec![],
                imports: vec![],
            });
        }

        // Build a star graph: all code files point to file_0.
        let n = file_nodes.len();
        #[expect(clippy::cast_possible_truncation, reason = "test: n_code << u32::MAX")]
        let edges: Vec<(u32, u32, u32)> = (1..n_code).map(|i| (i as u32, 0, 1)).collect();

        let (callers, callees) = build_neighbor_lists(n, &edges);
        let base_ranks = pagerank(n, &edges, None);

        let graph = RepoGraph {
            files: file_nodes,
            edges,
            base_ranks,
            callers,
            callees,
            def_edges: Vec::new(),
            def_ranks: Vec::new(),
            def_callers: Vec::new(),
            def_callees: Vec::new(),
            def_offsets: vec![0],
            alpha: 0.5,
        };

        (graph, json_idx)
    }

    /// D1: `render_json` yields a response whose `files` list is non-empty for a
    /// non-empty graph and serializes to a JSON array under the `files` key.
    ///
    /// On the baseline (before D1) `get_repo_map_ripvec` returned markdown prose
    /// via `repo_map::render`; no `files` key existed in the output.
    #[test]
    fn get_repo_map_returns_json_with_files_array() {
        let (graph, _json_idx) = build_test_graph(5, false);
        let response = render_json(&graph, 50, None, false);

        let has_files = !response.files.is_empty();
        assert!(
            has_files,
            "files array should be non-empty for a non-empty graph"
        );

        // Round-trip through a JSON string so the check sees the serialized shape.
        let wire = serde_json::to_string(&response).expect("serialize");
        let parsed = serde_json::from_str::<serde_json::Value>(&wire).expect("parse");
        let files_is_array = parsed["files"].is_array();
        assert!(
            files_is_array,
            "serialized response must have a `files` JSON array; got: {parsed}"
        );
    }

    /// D1: each file in the response carries an `lsp_location` with a non-empty
    /// `file_path`, both on the typed response and in its serialized JSON.
    ///
    /// Before D1, output was prose text; no `lsp_location` existed anywhere in
    /// the response.
    #[test]
    fn get_repo_map_each_file_has_lsp_location() {
        let (graph, _json_idx) = build_test_graph(5, false);
        let response = render_json(&graph, 50, None, false);

        // Typed view.
        for path in response.files.iter().map(|f| &f.lsp_location.file_path) {
            assert!(
                !path.is_empty(),
                "each file must have a non-empty lsp_location.file_path"
            );
        }

        // Serialized view.
        let wire = serde_json::to_string(&response).expect("serialize");
        let parsed = serde_json::from_str::<serde_json::Value>(&wire).expect("parse");
        let entries = parsed["files"].as_array().expect("files array");
        for entry in entries {
            let path_field = &entry["lsp_location"]["file_path"];
            assert!(
                path_field.is_string(),
                "each file entry must have lsp_location.file_path string; entry: {entry}"
            );
        }
    }

    /// D1: each symbol exposes a positive numeric `kind` and an `lsp_location`
    /// with a file path, on the typed response and in the serialized JSON.
    ///
    /// Before D1 symbols were rendered as prose strings like
    /// `"function_item func_0"`.
    #[test]
    fn get_repo_map_each_symbol_has_kind_and_lsp_location() {
        let (graph, _json_idx) = build_test_graph(3, false);
        let response = render_json(&graph, 50, None, false);

        // Typed view: walk every symbol of every file.
        for sym in response.files.iter().flat_map(|file| &file.symbols) {
            assert!(
                sym.kind > 0,
                "symbol kind must be a positive LSP SymbolKind; got 0 for '{}'",
                sym.name
            );
            assert!(
                !sym.lsp_location.file_path.is_empty(),
                "symbol must have lsp_location.file_path"
            );
        }

        // Serialized view: `kind` must be a number and the path a string.
        let wire = serde_json::to_string(&response).expect("serialize");
        let parsed = serde_json::from_str::<serde_json::Value>(&wire).expect("parse");
        let sym_entries = parsed["files"]
            .as_array()
            .expect("files")
            .iter()
            .flat_map(|file_entry| file_entry["symbols"].as_array().expect("symbols"));
        for sym_entry in sym_entries {
            assert!(
                sym_entry["kind"].is_number(),
                "symbol `kind` must be a JSON number; sym: {sym_entry}"
            );
            assert!(
                sym_entry["lsp_location"]["file_path"].is_string(),
                "symbol must have lsp_location.file_path; sym: {sym_entry}"
            );
        }
    }

    /// D1: every file's `calls` field serializes to an array whose entries are
    /// `RepoMapCall`-shaped objects (each has `lsp_location` and `rank`).
    ///
    /// In 4.0.1 calls moved from bare `lsp_location` objects to `RepoMapCall`
    /// objects that carry both the target `lsp_location` and the target file's
    /// `base_rank`.
    #[test]
    fn get_repo_map_calls_field_is_array_of_lsp_locations() {
        // Five-file star graph: files 1..=4 each have an edge into file_0, so
        // the spokes are the files with an outgoing edge.
        let (graph, _json_idx) = build_test_graph(5, false);
        let response = render_json(&graph, 50, None, false);

        let wire = serde_json::to_string(&response).expect("serialize");
        let parsed = serde_json::from_str::<serde_json::Value>(&wire).expect("parse");
        let all_calls = parsed["files"]
            .as_array()
            .expect("files")
            .iter()
            .flat_map(|file_entry| {
                file_entry["calls"]
                    .as_array()
                    .expect("calls must be an array")
            });
        for call in all_calls {
            // In 4.0.1 each call entry is a RepoMapCall with lsp_location + rank.
            let target_path = &call["lsp_location"]["file_path"];
            assert!(
                target_path.is_string(),
                "each call entry must have lsp_location.file_path string; call: {call}"
            );
            let target_rank = &call["rank"];
            assert!(
                target_rank.is_number(),
                "each call entry must have a numeric rank; call: {call}"
            );
        }
    }

    /// D2 / G3: a tight `render_json_budgeted` budget caps the number of files
    /// returned while `total_files` still reports the full eligible count.
    ///
    /// `render_json` (the compat shim) passes a generous budget, so this test
    /// calls `render_json_budgeted` directly. The budget of 150 is intended to
    /// correspond to about 600 bytes, i.e. three files at a 200-byte per-file
    /// envelope floor — NOTE(review): those byte figures come from the original
    /// test notes, not from code visible here.
    #[test]
    fn get_repo_map_returns_at_most_max_files_files() {
        let (graph, _json_idx) = build_test_graph(10, false);
        let response = render_json_budgeted(&graph, 150, None, false);

        // At most three files fit under the tight budget.
        let returned = response.files.len();
        assert!(
            returned <= 3,
            "files.len() = {} must be <= 3 for a 600-byte budget",
            returned
        );
        // The pre-cap count still covers all ten code files.
        assert_eq!(
            response.total_files, 10,
            "total_files must reflect the full eligible count before budget cap"
        );
        // And the response flags that it was truncated.
        let was_capped = response.capped;
        assert!(
            was_capped,
            "capped must be true when total_files > files.len()"
        );
    }

    /// D2: with `include_metadata=false` no returned file may have a `.json`
    /// extension (compared case-insensitively).
    ///
    /// Before D2, JSON files with thousands of repeated keys dominated the
    /// output (Issue #5 — JSON-key flooding).
    #[test]
    fn get_repo_map_excludes_meta_by_default() {
        let is_json = |path: &str| {
            std::path::Path::new(path)
                .extension()
                .is_some_and(|ext| ext.eq_ignore_ascii_case("json"))
        };

        let (graph, _json_idx) = build_test_graph(3, /*include_json=*/ true);
        // Metadata stays off for this call.
        let response = render_json(&graph, 50, None, false);

        let returned_paths = response
            .files
            .iter()
            .map(|f| f.lsp_location.file_path.as_str());
        for path in returned_paths {
            assert!(
                !is_json(path),
                "JSON (Meta) files must be excluded when include_metadata=false; found: {}",
                path
            );
        }
    }

    /// D2: with `include_metadata=true` the response must contain at least one
    /// file with a `.json` extension (compared case-insensitively).
    ///
    /// Callers who opt in to metadata should see all content kinds.
    #[test]
    fn get_repo_map_include_metadata_true_includes_json() {
        let is_json = |path: &str| {
            std::path::Path::new(path)
                .extension()
                .is_some_and(|ext| ext.eq_ignore_ascii_case("json"))
        };

        let (graph, _json_idx) = build_test_graph(3, /*include_json=*/ true);
        let response = render_json(&graph, 50, None, true);

        let mut returned_paths = response
            .files
            .iter()
            .map(|f| f.lsp_location.file_path.as_str());
        let has_json = returned_paths.any(is_json);
        assert!(
            has_json,
            "JSON file must be present when include_metadata=true"
        );
    }

    /// J1/J2 MEASUREMENT: flask corpus focus_file=blueprints.py rank dispersion.
    ///
    /// Mandatory measurement from the 4.0.5 Wave-2 Front-C briefing, as checked
    /// by this test:
    /// - `len(files) >= 8` (not collapsed to just the focus)
    /// - the focus file appears in the top 3 of the focused response
    /// - the next 5 non-focus files all have rank >= 10% of the top file's rank
    /// - neighborhood contains semantically related files (app.py, scaffold.py,
    ///   sansio) — logged only, not asserted
    ///
    /// # Panics
    ///
    /// Panics with an explicit message if `blueprints.py` is missing from the
    /// focused response, instead of overflowing while formatting its position.
    #[test]
    #[ignore = "runs on flask corpus at tests/corpus/code/flask; use --ignored --nocapture"]
    #[expect(
        clippy::too_many_lines,
        reason = "end-to-end corpus measurement test; assertion sequence is sequential and cannot be meaningfully split"
    )]
    fn test_flask_focus_blueprints_rank_dispersion() {
        let corpus_root = Path::new(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .unwrap()
            .parent()
            .unwrap()
            .join("tests/corpus/code/flask");

        assert!(
            corpus_root.exists(),
            "flask corpus not found at {}",
            corpus_root.display()
        );

        let graph = build_graph(&corpus_root).expect("build_graph on flask corpus");
        eprintln!("Flask corpus: {} files in graph", graph.files.len());

        // Find focus file
        let focus_path = "src/flask/blueprints.py";
        let focus_idx = graph.files.iter().position(|f| f.path == focus_path);
        eprintln!("Focus file '{focus_path}' -> idx: {focus_idx:?}");
        assert!(
            focus_idx.is_some(),
            "blueprints.py not found in graph; available files: {:?}",
            graph
                .files
                .iter()
                .map(|f| &f.path)
                .take(20)
                .collect::<Vec<_>>()
        );

        let response = render_json_budgeted(&graph, 4000, focus_idx, false);

        // Criterion 1: at least 8 files returned.
        eprintln!(
            "Focused response: {} files (total_files={})",
            response.files.len(),
            response.total_files
        );
        assert!(
            response.files.len() >= 8,
            "expected >= 8 files in focused response; got {} — I#16 winner-take-all collapse",
            response.files.len()
        );

        // Print top 10 for inspection.
        eprintln!("\nTop 10 focused files:");
        for (i, f) in response.files.iter().take(10).enumerate() {
            eprintln!("  [{i}] rank={:.6}  {}", f.rank, f.lsp_location.file_path);
        }

        // Criterion 2: focus file must appear near the top (top-3) of focused
        // results.  With PERSONALIZATION_ALPHA=0.15 and the flask corpus,
        // src/flask/app.py has higher structural rank than blueprints.py and
        // may legitimately rank #1 — the focus boosts blueprints.py relative
        // to its unfocused position, but doesn't guarantee it beats every
        // structurally central neighbor.  Being in top-3 confirms the bias
        // is working (pre-fix blueprints.py was #1 at 0.703 but that was a
        // degenerate collapse; now #1 or #2 is healthy).
        //
        // Position and rank are located in a single pass. A missing focus file
        // is reported explicitly: the previous `usize::MAX` sentinel overflowed
        // in the `+ 1` below before the assertion could fire.
        let Some((focus_position, focus_file_rank)) = response
            .files
            .iter()
            .enumerate()
            .find(|(_, f)| {
                f.lsp_location.file_path.contains("blueprints.py")
                    && !f.lsp_location.file_path.contains("test_")
                    && !f.lsp_location.file_path.contains("sansio")
            })
            .map(|(position, f)| (position, f.rank))
        else {
            panic!(
                "blueprints.py must be in top-3 focused results (not present in response at all); \
                 soft personalization must rebias toward focus neighborhood — I#16"
            );
        };
        eprintln!(
            "\nblueprints.py position: #{} rank={:.6}",
            focus_position + 1,
            focus_file_rank
        );
        assert!(
            focus_position < 3,
            "blueprints.py must be in top-3 focused results (got position {}); \
             soft personalization must rebias toward focus neighborhood — I#16",
            focus_position + 1
        );

        // Criterion 3: next 5 non-focus files have rank >= 10% of the top
        // file's rank.  This is the core dispersion check: no more Dirac-delta
        // collapse where one file is 0.703 and all others are 0.003.
        // Indexing `[0]` is safe: criterion 1 asserted at least 8 files.
        let top_rank = response.files[0].rank;
        let non_focus_min_5 = response
            .files
            .iter()
            .filter(|f| {
                !(f.lsp_location.file_path.contains("blueprints.py")
                    && !f.lsp_location.file_path.contains("test_")
                    && !f.lsp_location.file_path.contains("sansio"))
            })
            .take(5)
            .map(|f| f.rank)
            .fold(f32::INFINITY, f32::min);
        let pct = non_focus_min_5 / top_rank * 100.0;
        eprintln!(
            "\nNext-5 (non-focus) min rank: {non_focus_min_5:.6} = {pct:.1}% of top rank {top_rank:.6}"
        );
        assert!(
            pct >= 10.0,
            "next-5 non-focus files min rank is {pct:.1}% of top rank (need ≥ 10%); \
             files are collapsing to near-zero floor — I#16"
        );

        // Criterion 4: neighborhood quality — related files present.
        let related_names = ["app.py", "scaffold.py", "sansio"];
        let found_related: Vec<&str> = related_names
            .iter()
            .copied()
            .filter(|name| {
                response
                    .files
                    .iter()
                    .any(|f| f.lsp_location.file_path.contains(name))
            })
            .collect();
        eprintln!("\nNeighborhood quality: found related files: {found_related:?}");
        // At least one related file should appear (soft assertion — log if missing).
        if found_related.is_empty() {
            eprintln!(
                "WARNING: no expected related files {related_names:?} found in neighborhood"
            );
        }
    }

    /// Manual performance/smoke run of the repo-map pipeline on the ripvec
    /// source tree itself.
    ///
    /// Builds the graph for the directory two levels above
    /// `CARGO_MANIFEST_DIR`, renders it once without a focus and once focused
    /// on the file with the highest base rank, and prints timings, sizes, the
    /// top-5 files by rank, and both renders to stderr.  The test makes no
    /// assertions beyond "nothing panics"; it is `#[ignore]`d and meant to be
    /// run by hand with `--nocapture`.
    #[test]
    #[ignore = "runs on full ripvec codebase; use --nocapture to see output"]
    fn test_full_repo_map() {
        use std::time::Instant;

        // Token budget handed to both `render` calls.
        const TOKEN_BUDGET: usize = 2000;

        let root = Path::new(env!("CARGO_MANIFEST_DIR"))
            .parent()
            .expect("CARGO_MANIFEST_DIR should have a parent directory")
            .parent()
            .expect("CARGO_MANIFEST_DIR should be nested two levels below the ripvec root");

        // Phase 1: build_graph (walk + parse + import resolve + PageRank)
        let build_timer = Instant::now();
        let graph = build_graph(root).expect("build_graph on ripvec root");
        let build_ms = build_timer.elapsed().as_secs_f64() * 1000.0;

        // Phase 2: render (default, no focus)
        let render_timer = Instant::now();
        let rendered = render(&graph, TOKEN_BUDGET, None);
        let render_ms = render_timer.elapsed().as_secs_f64() * 1000.0;

        // Phase 3: render (topic-sensitive, focused on highest-ranked file).
        // The focus file is selected *before* the timer starts so that the
        // reported "render(focused)" time covers the render call only, not the
        // linear scan over `base_ranks`.
        let focus_idx = graph
            .base_ranks
            .iter()
            .enumerate()
            .max_by(|a, b| a.1.total_cmp(b.1))
            .map(|(i, _)| i);
        let focus_timer = Instant::now();
        let focused = render(&graph, TOKEN_BUDGET, focus_idx);
        let focus_ms = focus_timer.elapsed().as_secs_f64() * 1000.0;

        let total_defs: usize = graph.files.iter().map(|f| f.defs.len()).sum();

        eprintln!("\n=== Repo Map Performance ===");
        eprintln!(
            "Files: {}, Edges: {}, Defs: {}",
            graph.files.len(),
            graph.edges.len(),
            total_defs
        );
        eprintln!("build_graph:     {build_ms:.1}ms (walk + parse + resolve + PageRank)");
        // Token counts below are a rough estimate: output length divided by 4.
        eprintln!(
            "render(default): {render_ms:.3}ms ({} chars, ~{} tokens)",
            rendered.len(),
            rendered.len() / 4
        );
        eprintln!(
            "render(focused): {focus_ms:.3}ms ({} chars, ~{} tokens)",
            focused.len(),
            focused.len() / 4
        );

        eprintln!("\nTop 5 by PageRank:");
        let mut ranked: Vec<(usize, f32)> = graph.base_ranks.iter().copied().enumerate().collect();
        // Descending by rank; `total_cmp` gives a total order even with NaN.
        ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
        for &(file_idx, rank) in ranked.iter().take(5) {
            eprintln!("  {:.4} {}", rank, graph.files[file_idx].path);
        }

        eprintln!("\n=== Default Render ===\n{rendered}");
        eprintln!(
            "\n=== Focused Render (on {}) ===\n{focused}",
            // `focus_idx` is `None` only when the graph has no ranked files.
            focus_idx.map_or("none", |i| graph.files[i].path.as_str())
        );
    }
}