repotoire 0.8.0

//! Phase 1c: graph-aware enrichment of `Finding::prediction_reasons`.
//!
//! This module is the third step of the dual-branch infrastructure
//! (Phase 1c in the architecture note at
//! `docs/superpowers/specs/2026-05-09-dual-branch-phase1-architecture.md`).
//!
//! # What this does
//!
//! For every finding that points to a known graph location, attach two
//! kinds of typed `PredictionReason` derived from the graph:
//!
//! - `EnclosingScope { scope_kind, name }` — the function or class the
//!   finding sits inside, found by `GraphQuery::function_at_idx`.
//! - `ImportPresence { module }` — one reason per module the finding's
//!   file imports, found by `GraphQuery::node_by_name_idx` +
//!   `importees_idx`.
//!
//! # What this does NOT do
//!
//! - **Does not change severity or confidence.** Phase 1 is "collect,
//!   don't decide" (per the dual-branch RFC). Severity changes happen
//!   in per-detector predictor logic shipped in Phase 2.
//! - **Does not assign weights.** The `weight` field on every reason
//!   produced here is `0.0`. Phase 2 detectors that consume these
//!   reasons apply category-specific weights (e.g. a security detector
//!   knows `subprocess` matters more than `unittest`; a dead-code
//!   detector treats them the same). Putting weight assignment here
//!   would force this module to know every detector's domain rules,
//!   which is the wrong layering.
//! - **Does not re-run `PredictiveCodingEngine`.** The architecture
//!   note suggests Phase 1c could supply hierarchical evidence per
//!   finding by reusing the engine. That's deferred: the engine is
//!   currently constructed *inside* `HierarchicalSurprisalDetector::detect`
//!   and dropped after, so reuse requires hoisting it into the
//!   pipeline (a separable refactor with its own decisions about
//!   ownership, lifetime, and cache invalidation). Phase 1c ships the
//!   smaller, additive piece; engine-hoisting is its own phase.
//!
//! # Deviation from the architecture note's gating
//!
//! The architecture note says this pass should run "for each finding
//! with `alternative_branch.is_some()`." In Phase 1, no detector emits
//! `alternative_branch` (that's Phase 2). Strictly applying the
//! spec'd gate would ship dead code until Phase 2 detector migrations
//! land.
//!
//! Instead, this pass enriches *every* finding with a graph location.
//! Reasons:
//!
//! 1. **Cost is negligible.** A handful of indexed graph lookups per
//!    finding (enclosing function, file node, and one node resolution
//!    per import); all are O(1)-to-O(small) on pre-built indexes.
//! 2. **Better data flow.** Graph evidence becomes *input* to Phase 2
//!    detectors' dual-branch decisions, not a downstream decoration of
//!    decisions already made.
//! 3. **No dead code.** Every line that runs is exercised by every
//!    scan, so test coverage and behavior stay aligned.
//! 4. **The Phase 1d feature flag** (separate, not yet shipped) gates
//!    the *whole* Phase 1 system at the postprocess level. Per-finding
//!    gating inside this pass would be the wrong layer.
//!
//! # JSON output size
//!
//! For a file with N imports, this pass adds 1 + N reasons per finding
//! in that file (1 `EnclosingScope` + N `ImportPresence`). On a typical
//! Python file with 10 imports across hundreds of findings, that's a
//! non-trivial amount of JSON (~1KB per finding). No cap is applied:
//!
//! - Capping at an arbitrary number (e.g. "first 10 imports") would
//!   silently drop information Phase 2 detectors might need to query
//!   ("does this file import `subprocess`?").
//! - The Phase 1d feature flag (separate, not yet shipped) gates the
//!   whole Phase 1 system at the postprocess level. If JSON size
//!   becomes a real problem before Phase 2 lands, disable the flag.
//! - If a real user reports JSON size as a problem post-flag, the
//!   right fix is probably to skip emission for findings whose
//!   detector isn't dual-branch-migrated yet, not to cap the import
//!   list.
//!
//! # Idempotence
//!
//! Calling `enrich_graph_evidence` twice on the same findings produces
//! the same `prediction_reasons` content twice (i.e. duplicates). The
//! pipeline calls it exactly once after `enrich_confidence`. If a
//! future pipeline change needs to re-run it, dedup logic will need to
//! be added — this is documented here rather than implemented because
//! adding dedup would require a notion of "reason identity" that's not
//! yet needed.

use crate::dual_branch::{PredictionReason, PredictionReasonKind};
use crate::graph::store_models::NodeKind;
use crate::graph::GraphQuery;
use crate::models::Finding;

/// Map a `NodeKind` to the fixed label stored in
/// `EnclosingScope::scope_kind`.
///
/// The labels are spelled out by hand instead of being taken from
/// `NodeKind`'s `Debug` output: the string ends up in serialized
/// Phase 1c reasons that downstream consumers match on, so it must
/// not drift if the derive's shape ever changes.
///
/// The match deliberately has no wildcard arm, so adding a `NodeKind`
/// variant is a compile error here until a label is chosen for it.
fn node_kind_label(kind: NodeKind) -> &'static str {
    match kind {
        // Code-structure kinds.
        NodeKind::Function => "Function",
        NodeKind::Class => "Class",
        NodeKind::Variable => "Variable",
        // Container kinds.
        NodeKind::File => "File",
        NodeKind::Module => "Module",
        // History kind.
        NodeKind::Commit => "Commit",
    }
}

/// Telemetry counters accumulated during one `enrich_graph_evidence`
/// run.
///
/// `enrich_graph_evidence` builds one value of this type per call,
/// threads it through `enrich_one` for every finding, and then emits
/// it as a single `tracing::debug` summary. The function itself
/// returns `()`, so the counters are only observable through that log
/// line or by calling `enrich_one` directly with a caller-owned
/// instance.
///
/// The summary is enough to detect a "silent no-op" failure mode
/// (e.g. a path-format mismatch where the graph stores `src/foo.py`
/// but findings carry `/abs/path/src/foo.py`, leading to 0% hit rate).
///
/// Crate-visible with public fields so other code in the crate
/// (including this module's tests) can construct and inspect it; not
/// exported beyond the crate.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub(crate) struct GraphEnrichmentStats {
    /// Total findings examined. Set by `enrich_graph_evidence` to the
    /// length of the input slice, so it also counts findings that were
    /// skipped because they have no affected file.
    pub examined: usize,
    /// Findings that gained an `EnclosingScope` reason (at most one is
    /// attached per finding per pass).
    pub enclosing_scope_hits: usize,
    /// Findings whose first affected file resolved to a graph node via
    /// `node_by_name_idx` (this lookup gates `ImportPresence`).
    pub file_node_hits: usize,
    /// Total `ImportPresence` reasons attached across all findings.
    pub import_reasons_attached: usize,
}

/// Enrich every finding's `prediction_reasons` with graph-derived
/// evidence. See module-level docs for what is and isn't done.
///
/// Mutates `findings` in place. Findings whose location can't be
/// resolved in the graph (no file node, no enclosing function) are
/// left untouched — graph misses are silent at the per-finding level
/// but counted in the per-scan `tracing::debug` summary so a
/// path-format mismatch surfaces as "0 file_node_hits out of N
/// examined" instead of staying invisible.
pub fn enrich_graph_evidence(findings: &mut [Finding], graph: &dyn GraphQuery) {
    let mut stats = GraphEnrichmentStats {
        examined: findings.len(),
        ..Default::default()
    };
    for finding in findings.iter_mut() {
        enrich_one(finding, graph, &mut stats);
    }
    if stats.examined > 0 {
        // Emit even on zero hits — that's the most important signal
        // (silent no-op caused by path-format mismatch).
        tracing::debug!(
            "Graph evidence enrichment: {} findings examined, \
             {} got EnclosingScope, {} resolved to a file node, \
             {} ImportPresence reasons attached",
            stats.examined,
            stats.enclosing_scope_hits,
            stats.file_node_hits,
            stats.import_reasons_attached,
        );
    }
}

/// Attach graph-derived reasons to a single finding and update `stats`.
///
/// Lookups are keyed on the finding's first affected file (lossily
/// converted to a string) and, for the enclosing scope, its
/// `line_start`. Up to one `EnclosingScope` reason is pushed, followed
/// by one `ImportPresence` reason per module imported by that file.
/// Every reason carries `weight: 0.0`. A finding with no affected
/// files is returned untouched and leaves `stats` unchanged.
fn enrich_one(finding: &mut Finding, graph: &dyn GraphQuery, stats: &mut GraphEnrichmentStats) {
    // Only the first affected file is consulted, even when a finding
    // spans several; the original author notes this mirrors
    // `confidence_enrichment`.
    let Some(primary_file) = finding.affected_files.first() else {
        return;
    };
    let lossy_path = primary_file.to_string_lossy();
    let path: &str = lossy_path.as_ref();

    // ── Step 1: EnclosingScope ──────────────────────────────────────
    //
    // (file, line) → function index → node. Any missing link in the
    // chain (no line_start, no function at that line, index that does
    // not resolve) means no reason is attached.
    let enclosing_node = finding
        .line_start
        .and_then(|line| graph.function_at_idx(path, line))
        .and_then(|fn_idx| graph.node_idx(fn_idx));
    if let Some(scope_node) = enclosing_node {
        let name = graph
            .interner()
            .resolve(scope_node.qualified_name)
            .to_string();
        let scope_kind = node_kind_label(scope_node.kind).to_string();
        let note = format!(
            "inside {} {} (lines {}-{})",
            scope_kind, name, scope_node.line_start, scope_node.line_end
        );
        finding.prediction_reasons.push(PredictionReason {
            kind: PredictionReasonKind::EnclosingScope { scope_kind, name },
            // Evidence only: weighting is left to the consumers of
            // these reasons (see module-level docs).
            weight: 0.0,
            note,
        });
        stats.enclosing_scope_hits += 1;
    }

    // ── Step 2: ImportPresence ──────────────────────────────────────
    //
    // Requires the file itself to be a node in the graph.
    let Some((file_idx, _)) = graph.node_by_name_idx(path) else {
        return;
    };
    stats.file_node_hits += 1;

    // Resolve every importee to its qualified name up front so the
    // names can be sorted; importees whose index does not resolve to
    // a node are skipped.
    let mut modules: Vec<String> = Vec::new();
    for &importee in graph.importees_idx(file_idx).iter() {
        if let Some(imported) = graph.node_idx(importee) {
            modules.push(
                graph
                    .interner()
                    .resolve(imported.qualified_name)
                    .to_string(),
            );
        }
    }
    // Alphabetical order makes the emitted reasons independent of the
    // order in which the graph happened to record the import edges.
    modules.sort_unstable();

    stats.import_reasons_attached += modules.len();
    for module in modules {
        finding.prediction_reasons.push(PredictionReason {
            // `note` is built first because `kind` takes ownership of
            // `module`.
            note: format!("file imports `{}`", module),
            kind: PredictionReasonKind::ImportPresence { module },
            weight: 0.0,
        });
    }
}

/// Unit tests for the graph-evidence enrichment pass.
///
/// Each test builds a small frozen graph with `GraphBuilder`, runs the
/// pass over one hand-made finding, and inspects the resulting
/// `prediction_reasons` (or, for the counter tests, the
/// `GraphEnrichmentStats` filled in by `enrich_one`).
#[cfg(test)]
mod tests {
    use super::*;
    use crate::graph::builder::GraphBuilder;
    use crate::graph::store_models::{CodeEdge, NodeKind};
    use crate::graph::CodeNode;
    use crate::models::Severity;
    use std::path::PathBuf;

    /// Build a minimal graph with one file, one function in that file
    /// (lines 10-20), and one import edge (file → external module).
    /// Returns the frozen `CodeGraph`.
    ///
    /// Most test cases use this fixture; the import-ordering test uses
    /// `build_graph_with_unsorted_imports` instead.
    fn build_test_graph() -> crate::graph::CodeGraph {
        let mut b = GraphBuilder::new();
        let py = b.interner().intern("python");
        let empty = b.interner().empty_key();

        // File node — qualified_name == file path, by graph convention.
        let file_qn = "src/foo.py";
        let file_qn_key = b.interner().intern(file_qn);
        b.add_node(CodeNode {
            kind: NodeKind::File,
            name: file_qn_key,
            qualified_name: file_qn_key,
            file_path: file_qn_key,
            language: py,
            line_start: 1,
            line_end: 100,
            complexity: 0,
            param_count: 0,
            method_count: 0,
            field_count: 0,
            max_nesting: 0,
            return_count: 0,
            commit_count: 0,
            flags: 0,
        });

        // Function node — lines 10-20 inside src/foo.py.
        let fn_qn = "src/foo.py::do_thing";
        let fn_qn_key = b.interner().intern(fn_qn);
        let fn_name_key = b.interner().intern("do_thing");
        b.add_node(CodeNode {
            kind: NodeKind::Function,
            name: fn_name_key,
            qualified_name: fn_qn_key,
            file_path: file_qn_key,
            language: py,
            line_start: 10,
            line_end: 20,
            complexity: 5,
            param_count: 2,
            method_count: 0,
            field_count: 0,
            max_nesting: 0,
            return_count: 0,
            commit_count: 0,
            flags: 0,
        });

        // Imported module node. External package — no language, no file
        // path of its own; everything else zeroed.
        let imp_qn = "requests";
        let imp_qn_key = b.interner().intern(imp_qn);
        b.add_node(CodeNode {
            kind: NodeKind::Module,
            name: imp_qn_key,
            qualified_name: imp_qn_key,
            file_path: empty,
            language: empty,
            line_start: 0,
            line_end: 0,
            complexity: 0,
            param_count: 0,
            method_count: 0,
            field_count: 0,
            max_nesting: 0,
            return_count: 0,
            commit_count: 0,
            flags: 0,
        });

        // Edges: file Contains function, file Imports module.
        b.add_edge_by_name(file_qn, fn_qn, CodeEdge::contains());
        b.add_edge_by_name(file_qn, imp_qn, CodeEdge::imports());

        b.freeze()
    }

    /// Build a medium-severity finding from `TestDetector` whose only
    /// affected file is `file` and whose `line_start` is `line`. All
    /// other fields take their `Default` values.
    fn make_finding_at(file: &str, line: u32) -> Finding {
        Finding {
            id: "f".into(),
            detector: "TestDetector".into(),
            severity: Severity::Medium,
            affected_files: vec![PathBuf::from(file)],
            line_start: Some(line),
            ..Default::default()
        }
    }

    #[test]
    fn enclosing_scope_attached_when_finding_inside_function() {
        let graph = build_test_graph();
        let mut findings = vec![make_finding_at("src/foo.py", 15)];
        enrich_graph_evidence(&mut findings, &graph);

        let has_enclosing = findings[0].prediction_reasons.iter().any(|r| {
            matches!(
                &r.kind,
                PredictionReasonKind::EnclosingScope { name, .. } if name == "src/foo.py::do_thing"
            )
        });
        assert!(
            has_enclosing,
            "finding at src/foo.py:15 must get EnclosingScope=do_thing; got reasons: {:?}",
            findings[0].prediction_reasons,
        );
    }

    #[test]
    fn no_enclosing_scope_when_line_outside_any_function() {
        let graph = build_test_graph();
        // Line 99 is past the end of `do_thing` (lines 10-20).
        let mut findings = vec![make_finding_at("src/foo.py", 99)];
        enrich_graph_evidence(&mut findings, &graph);

        let has_enclosing = findings[0]
            .prediction_reasons
            .iter()
            .any(|r| matches!(r.kind, PredictionReasonKind::EnclosingScope { .. }));
        assert!(
            !has_enclosing,
            "line outside any function must not get EnclosingScope; got reasons: {:?}",
            findings[0].prediction_reasons,
        );
    }

    #[test]
    fn no_enclosing_scope_when_finding_has_no_line_start() {
        let graph = build_test_graph();
        let mut f = make_finding_at("src/foo.py", 0);
        f.line_start = None;
        let mut findings = vec![f];
        enrich_graph_evidence(&mut findings, &graph);

        let has_enclosing = findings[0]
            .prediction_reasons
            .iter()
            .any(|r| matches!(r.kind, PredictionReasonKind::EnclosingScope { .. }));
        assert!(
            !has_enclosing,
            "finding without line_start must not get EnclosingScope (no anchor to walk from)",
        );
    }

    #[test]
    fn import_presence_attached_for_each_imported_module() {
        let graph = build_test_graph();
        let mut findings = vec![make_finding_at("src/foo.py", 15)];
        enrich_graph_evidence(&mut findings, &graph);

        let has_import = findings[0].prediction_reasons.iter().any(|r| {
            matches!(
                &r.kind,
                PredictionReasonKind::ImportPresence { module } if module == "requests"
            )
        });
        assert!(
            has_import,
            "src/foo.py imports `requests`; expected ImportPresence reason. Got: {:?}",
            findings[0].prediction_reasons,
        );
    }

    #[test]
    fn finding_with_no_affected_files_is_left_untouched() {
        let graph = build_test_graph();
        let mut f = make_finding_at("src/foo.py", 15);
        f.affected_files.clear();
        let before = f.prediction_reasons.len();
        let mut findings = vec![f];
        enrich_graph_evidence(&mut findings, &graph);
        assert_eq!(
            findings[0].prediction_reasons.len(),
            before,
            "finding with no affected_files must be left untouched",
        );
    }

    #[test]
    fn file_not_in_graph_skips_silently() {
        let graph = build_test_graph();
        let mut findings = vec![make_finding_at("src/not_in_graph.py", 5)];
        let before = findings[0].prediction_reasons.len();
        enrich_graph_evidence(&mut findings, &graph);
        // No EnclosingScope (no function at that location), no
        // ImportPresence (no file node). Should be a no-op.
        assert_eq!(
            findings[0].prediction_reasons.len(),
            before,
            "file not in graph must produce no reasons (silent skip); got: {:?}",
            findings[0].prediction_reasons,
        );
    }

    #[test]
    fn weight_is_zero_for_all_phase_1c_reasons() {
        // Pinned policy: Phase 1c ships weight = 0.0 evidence. See
        // module-level docs for why (Phase 2 detectors apply
        // category-specific weights). If this test fails, the
        // module's documented contract has changed and the docs
        // need updating in the same commit.
        let graph = build_test_graph();
        let mut findings = vec![make_finding_at("src/foo.py", 15)];
        enrich_graph_evidence(&mut findings, &graph);

        // Guard against a vacuous pass: if enrichment attached
        // nothing, the loop below would assert nothing.
        assert!(
            !findings[0].prediction_reasons.is_empty(),
            "fixture finding at src/foo.py:15 must receive at least one reason, \
             otherwise the weight check below checks nothing",
        );

        for reason in &findings[0].prediction_reasons {
            assert_eq!(
                reason.weight, 0.0,
                "Phase 1c reason {:?} must have weight 0.0; got {}",
                reason.kind, reason.weight,
            );
        }
    }

    #[test]
    fn enrich_is_additive_preserves_existing_reasons() {
        let graph = build_test_graph();
        let mut f = make_finding_at("src/foo.py", 15);
        let pre_existing = PredictionReason {
            kind: PredictionReasonKind::Custom {
                description: "from-earlier-pass".into(),
            },
            weight: 0.5,
            note: "must survive".into(),
        };
        f.prediction_reasons.push(pre_existing.clone());
        let mut findings = vec![f];
        enrich_graph_evidence(&mut findings, &graph);

        // Pre-existing reason still present and at the front.
        assert_eq!(
            findings[0].prediction_reasons[0], pre_existing,
            "earlier reasons must be preserved; got: {:?}",
            findings[0].prediction_reasons,
        );
        assert!(
            findings[0].prediction_reasons.len() > 1,
            "graph enrichment must add at least one reason on top of the existing",
        );
    }

    /// Build a graph with a file importing two modules in
    /// reverse-alphabetical insertion order, to verify the pass sorts
    /// import names before emitting reasons.
    fn build_graph_with_unsorted_imports() -> crate::graph::CodeGraph {
        let mut b = GraphBuilder::new();
        let py = b.interner().intern("python");
        let empty = b.interner().empty_key();

        let file_qn = "src/bar.py";
        let file_qn_key = b.interner().intern(file_qn);
        b.add_node(CodeNode {
            kind: NodeKind::File,
            name: file_qn_key,
            qualified_name: file_qn_key,
            file_path: file_qn_key,
            language: py,
            line_start: 1,
            line_end: 50,
            complexity: 0,
            param_count: 0,
            method_count: 0,
            field_count: 0,
            max_nesting: 0,
            return_count: 0,
            commit_count: 0,
            flags: 0,
        });

        // Add two import targets in reverse order ("z..." before
        // "a..."). If the pass emits in graph-insertion order, the
        // resulting reasons will be ["zebra", "alpha"]; if sorted,
        // ["alpha", "zebra"].
        for name in ["zebra", "alpha"] {
            let key = b.interner().intern(name);
            b.add_node(CodeNode {
                kind: NodeKind::Module,
                name: key,
                qualified_name: key,
                file_path: empty,
                language: empty,
                line_start: 0,
                line_end: 0,
                complexity: 0,
                param_count: 0,
                method_count: 0,
                field_count: 0,
                max_nesting: 0,
                return_count: 0,
                commit_count: 0,
                flags: 0,
            });
            b.add_edge_by_name(file_qn, name, CodeEdge::imports());
        }

        b.freeze()
    }

    #[test]
    fn import_presence_emitted_in_alphabetical_order() {
        // Pinned policy: ImportPresence reasons are sorted
        // alphabetically by module name. The graph's importees_idx
        // returns slices in graph-build order, which is
        // non-deterministic across parallel parses; sorting in this
        // pass guarantees byte-identical JSON for the same input
        // regardless of build order. If this test fails, downstream
        // JSON diffing and content-keyed caches will silently break.
        let graph = build_graph_with_unsorted_imports();
        let mut findings = vec![make_finding_at("src/bar.py", 1)];
        enrich_graph_evidence(&mut findings, &graph);

        let import_modules: Vec<&str> = findings[0]
            .prediction_reasons
            .iter()
            .filter_map(|r| match &r.kind {
                PredictionReasonKind::ImportPresence { module } => Some(module.as_str()),
                _ => None,
            })
            .collect();

        assert_eq!(
            import_modules,
            vec!["alpha", "zebra"],
            "ImportPresence reasons must be alphabetically sorted; \
             got {:?}",
            import_modules,
        );
    }

    #[test]
    fn enclosing_scope_kind_label_is_stable_string() {
        // Pinned policy: scope_kind is a stable string label, NOT
        // the Debug derive of NodeKind. If this test fails because
        // the value changed, do NOT update the assertion to match —
        // either restore the previous label or version-bump the
        // serialized format with a documented migration.
        let graph = build_test_graph();
        let mut findings = vec![make_finding_at("src/foo.py", 15)];
        enrich_graph_evidence(&mut findings, &graph);

        let scope_kind = findings[0]
            .prediction_reasons
            .iter()
            .find_map(|r| match &r.kind {
                PredictionReasonKind::EnclosingScope { scope_kind, .. } => Some(scope_kind.clone()),
                _ => None,
            })
            .expect("EnclosingScope reason should be present");

        assert_eq!(
            scope_kind, "Function",
            "scope_kind for a function-typed node must be the stable string \
             \"Function\" (decoupled from NodeKind's Debug derive); got {:?}",
            scope_kind,
        );
    }

    #[test]
    fn node_kind_label_covers_all_variants() {
        // Forcing function: if a future change adds a NodeKind
        // variant, the match in `node_kind_label` will fail to
        // compile (non-exhaustive match) before this test runs. The
        // test exists to document that this coverage is intentional.
        // Only non-emptiness is asserted, to avoid encoding the exact
        // strings here (which would be a duplicate of the function
        // under test).
        for kind in [
            NodeKind::File,
            NodeKind::Function,
            NodeKind::Class,
            NodeKind::Module,
            NodeKind::Variable,
            NodeKind::Commit,
        ] {
            let label = node_kind_label(kind);
            assert!(
                !label.is_empty(),
                "node_kind_label({:?}) must return non-empty string; got {:?}",
                kind,
                label,
            );
        }
    }

    #[test]
    fn enrich_one_populates_stats_counters_on_graph_hit() {
        // The per-scan debug summary is built from these counters, so
        // they must track what was actually attached. `enrich_one` is
        // driven directly because `enrich_graph_evidence` only logs
        // the counters and does not hand them back.
        let graph = build_test_graph();
        let mut finding = make_finding_at("src/foo.py", 15);
        let mut stats = GraphEnrichmentStats::default();
        enrich_one(&mut finding, &graph, &mut stats);

        let import_reasons = finding
            .prediction_reasons
            .iter()
            .filter(|r| matches!(r.kind, PredictionReasonKind::ImportPresence { .. }))
            .count();

        assert_eq!(
            stats.enclosing_scope_hits, 1,
            "finding inside do_thing must count one EnclosingScope hit; stats: {:?}",
            stats,
        );
        assert_eq!(
            stats.file_node_hits, 1,
            "src/foo.py is a graph node and must count one file-node hit; stats: {:?}",
            stats,
        );
        assert!(
            import_reasons >= 1,
            "src/foo.py imports `requests`; expected at least one ImportPresence reason",
        );
        assert_eq!(
            stats.import_reasons_attached, import_reasons,
            "import counter must equal the number of ImportPresence reasons attached; stats: {:?}",
            stats,
        );
        // `examined` is owned by `enrich_graph_evidence`, not by the
        // per-finding helper.
        assert_eq!(
            stats.examined, 0,
            "enrich_one must not touch `examined`; stats: {:?}",
            stats,
        );
    }

    #[test]
    fn enrich_one_leaves_stats_at_zero_on_graph_miss() {
        // The "silent no-op" scenario the telemetry exists for: a
        // path the graph does not know must register zero hits.
        let graph = build_test_graph();
        let mut finding = make_finding_at("src/not_in_graph.py", 5);
        let mut stats = GraphEnrichmentStats::default();
        enrich_one(&mut finding, &graph, &mut stats);

        assert_eq!(
            stats,
            GraphEnrichmentStats::default(),
            "a finding whose file is not in the graph must not bump any counter",
        );
    }
}