repotoire 0.9.0

//! Dual-branch predictor for Python XXE parser sites.
//!
//! Implements decisions D1 (weights, with safe-by-construction
//! inversion) and D3 (severity) from
//! `docs/superpowers/specs/2026-05-09-dual-branch-phase2-xxe-decisions.md`.
//!
//! # What this module does
//!
//! Given a Python XML-parser call site (`etree.parse(...)`,
//! `XMLParser(...)`, `ET.parse(...)`, `defusedxml.ElementTree.parse(...)`),
//! produce a [`Prediction`] that:
//!
//! 1. Picks `RealBug` or `Benign` as the predicted branch.
//! 2. Carries the other branch as the alternative.
//! 3. Lists typed [`PredictionReason`]s the predictor used.
//! 4. Optionally lists [`ResolutionSignal`]s (collapsing or hint-grade).
//!
//! # The safe-by-construction inversion (D1)
//!
//! Phases 2a–2d shared an "argument-origin" model: the detector flagged
//! a sink and the predictor asked "is the argument attacker-controlled?".
//! XXE inverts this: the sink is always the same shape (an XML parser
//! call); what varies is whether the *parser* is configured to refuse
//! external entities. So the predictor asks: "is this parser configured
//! securely?".
//!
//! The **sign convention is preserved** (positive → Benign, negative →
//! RealBug). What changes is the *lexicon* of which imports/kwargs/scopes
//! count.
//!
//! # Sign convention
//!
//! `weight > 0` leans **Benign**; `weight < 0` leans **RealBug**.
//!
//! # Severity mapping (D3)
//!
//! - Predicted **RealBug** → `Severity::Critical` if user input flows
//!   to the parser, else `Severity::High`. This preserves the existing
//!   single-branch calibration at the old `xxe.rs:322-327`.
//! - Predicted **Benign** → `Severity::Info`.
//! - Alternative branch carries the opposite label's severity.
//!
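//! # Resolution signals (collapsing)
//!
//! Three signals fully collapse the prediction (checked in this order):
//!
//! - `# repotoire: xxe-safe[<reason>]` → `Benign` (Info).
//! - `# repotoire: xxe-vulnerable[<source>]` → `RealBug` (severity from
//!   the existing High/Critical logic).
//! - A call site on a `defusedxml.*` API → `Benign` (Info); see the D1
//!   amendment note on `W_API_DEFUSEDXML_CALL`.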
//!
//! # Why these weights
//!
//! See decision **D1** (with honest-review notes on the +0.30 defusedxml
//! weight choice and the lxml split). Numbers tagged `TUNABLE`.
//! Phase 3 misprediction logging is the right place to retune.

use super::annotation::parse_python_comment;
use crate::dual_branch::{
    AlternativeBranch, BranchLabel, PredictionReason, PredictionReasonKind, ResolutionKind,
    ResolutionSignal,
};
use crate::models::Severity;

// ─────────────────────────────────────────────────────────────────────────────
// XmlApi — the parser-API enum for XXE
// ─────────────────────────────────────────────────────────────────────────────

/// Which Python XML parser API the call site uses. Distinguishes the
/// safe-by-default family (`defusedxml.*`) from the configurable
/// (`lxml.etree`) from the unsafe-by-default (`xml.etree.*`,
/// `xml.sax.*`, `xml.dom.*`).
///
/// Note: unlike `CommandApi`, this enum does NOT carry a `severity_for`
/// 2D table. XXE severity is one-dimensional: Critical if user input
/// flows to the parser, else High (per old `xxe.rs:322-327`). Branch
/// severity is computed by `severity_for_branch` below.
///
/// The type is `Copy`, so it is passed by value throughout this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum XmlApi {
    /// `defusedxml.ElementTree`, `defusedxml.lxml`, `defusedxml.sax`,
    /// `defusedxml.minidom`, etc. Safe-by-default.
    ///
    /// `predict` treats a call on this API as a collapsing Benign
    /// signal rather than an additive weight.
    Defusedxml,
    /// `lxml.etree` — capable of safe use via `resolve_entities=False`
    /// / `no_network=True` kwargs, but not safe by default.
    LxmlEtree,
    /// `xml.etree.ElementTree` (stdlib). Documented as unsafe at
    /// https://docs.python.org/3/library/xml.html#xml-vulnerabilities
    /// for Python ≤ 3.11.
    StdlibElementTree,
    /// `xml.sax`, `xml.dom`, `xml.dom.minidom`, `xml.dom.pulldom`.
    /// Stdlib, unsafe-by-default.
    StdlibOther,
    /// Some other parser we recognized but can't classify (e.g.
    /// `xml2js`-like patterns that slipped past the language filter).
    /// No signal contribution.
    ///
    /// Also the value `predict` substitutes when `Evidence::api` is
    /// `None`.
    Unknown,
}

impl XmlApi {
    /// Display name of the API, interpolated into finding titles and
    /// descriptions.
    pub(super) fn callee_label(self) -> &'static str {
        match self {
            Self::Unknown => "XML parser",
            Self::StdlibOther => "xml.sax / xml.dom",
            Self::StdlibElementTree => "xml.etree.ElementTree",
            Self::LxmlEtree => "lxml.etree",
            Self::Defusedxml => "defusedxml",
        }
    }

    /// Whether the API is one of the stdlib XML modules that are
    /// unsafe by default. The predictor uses this to fire its
    /// "parser uses stdlib xml module without protection"
    /// `StructuralPattern` reason.
    pub(super) fn is_stdlib_unsafe(self) -> bool {
        match self {
            Self::StdlibElementTree | Self::StdlibOther => true,
            Self::Defusedxml | Self::LxmlEtree | Self::Unknown => false,
        }
    }

    /// Whether the API is a recognized Python XML library. Gates the
    /// Phase 2e dual-branch emission path: only Python sites get the
    /// predictor-aware shape; JS/Java/etc. still go through the
    /// legacy regex scanner per decisions D4.
    pub(super) fn is_python(self) -> bool {
        match self {
            Self::Unknown => false,
            Self::Defusedxml
            | Self::LxmlEtree
            | Self::StdlibElementTree
            | Self::StdlibOther => true,
        }
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Tunable weights
// ─────────────────────────────────────────────────────────────────────────────

// TUNABLE: see Phase 3 misprediction logging.
//
// Sign convention: positive leans Benign, negative leans RealBug.
//
// Calibration target (per decisions doc D1 worked examples):
//   * lxml + `resolve_entities=False` + `no_network=True`:
//     +0.10 + 0.40 + 0.30 = +0.80 → strongly Benign.
//   * Stdlib xml.etree in a Flask handler reading `request.data`:
//     -0.20 + -0.50 + -0.30 = -1.00 → strongly RealBug.
//   * Unused `import defusedxml` + stdlib parse, no negative signals:
//     +0.30 + -0.20 = +0.10 → tiebreak-close Benign (documented
//     v0 false-positive; see D5 #1).
//   * Unused `import defusedxml` + stdlib parse in handler reading
//     request: +0.30 + -0.20 + -0.50 + -0.30 = -0.70 → correctly RealBug.

/// `defusedxml` module imported anywhere in the file. Weak Benign
/// signal: import is *capable* of being the safe pattern but doesn't
/// rule out a stdlib parse in the same file (D5 #1 v0 limitation).
const W_IMPORT_DEFUSEDXML: f32 = 0.30;

/// `lxml.etree` imported. Even weaker signal: lxml is capable of safe
/// use but not safe-by-default. Pairs with the kwarg signals.
const W_IMPORT_LXML_ETREE: f32 = 0.10;

/// `resolve_entities=False` kwarg on the XMLParser constructor.
/// The decisive lxml protection kwarg.
const W_KW_RESOLVE_ENTITIES_FALSE: f32 = 0.40;

/// `no_network=True` kwarg. Closes the SSRF-via-XXE leg.
const W_KW_NO_NETWORK_TRUE: f32 = 0.30;

/// `forbid_dtd=True` kwarg (defusedxml explicit opt-in). Same
/// magnitude as `W_KW_RESOLVE_ENTITIES_FALSE`.
const W_KW_FORBID_DTD_TRUE: f32 = 0.40;

/// Enclosing function looks like a test fixture.
///
/// Mutually exclusive with `W_ENCLOSING_HANDLER`: `predict` checks the
/// test lexicon first and only consults the handler lexicon when the
/// test lexicon does not match.
const W_ENCLOSING_TEST_FUNCTION: f32 = 0.15;

/// User input (request body / uploaded data / etc.) flows to the
/// parser within a 10-line lookback window (carry-over of the
/// existing `has_user_input_flow` heuristic at old `xxe.rs:181`).
const W_USER_INPUT_FLOW: f32 = -0.50;

/// Enclosing function looks like a request handler.
const W_ENCLOSING_HANDLER: f32 = -0.30;

/// The parser API is a stdlib XML module known to be unsafe by
/// default (`xml.etree`, `xml.sax`, `xml.dom`). Documented at
/// https://docs.python.org/3/library/xml.html#xml-vulnerabilities.
/// Applied when `XmlApi::is_stdlib_unsafe` returns true.
const W_STDLIB_UNSAFE_PARSER: f32 = -0.20;

/// The parser API IS `defusedxml` (i.e. the call site is literally
/// `defusedxml.ElementTree.parse(...)` or `ET.parse(...)` resolved
/// through the alias map to `defusedxml.*`). Symmetric counterpart
/// to `W_STDLIB_UNSAFE_PARSER`: stronger than the file-scoped
/// `W_IMPORT_DEFUSEDXML` because it pins the actual call (not just
/// "defusedxml is in scope somewhere"). This is the safe-by-
/// construction signal that D1's spec-inversion is built around.
///
/// Honest review note (2026-05-11): D1's initial weights table
/// included only `ImportPresence{defusedxml}: +0.30` and was missing
/// this call-site signal. The integration commit's
/// `flag_on_defusedxml_classifies_benign` test surfaced the gap —
/// without this signal, `defusedxml.ElementTree.parse(req.data)`
/// inside a handler scored -0.50 (RealBug), defeating the whole
/// point of using defusedxml.
///
/// Treatment: rather than a +0.50 additive weight (which still
/// loses to handler + user-input combined at -0.80), we treat a
/// defusedxml call as **collapsing to Benign**, same family as the
/// `xxe-safe` annotation. defusedxml *is* the safe-by-construction
/// answer; the input source doesn't matter once entity resolution
/// is disabled at the parser level. The reason weight on the
/// emitted PredictionReason is +1.0 to mirror an annotation
/// collapse.
///
/// Consequently this constant is never added to the running score in
/// `predict`; it only labels the single reason emitted on the
/// collapse path.
const W_API_DEFUSEDXML_CALL: f32 = 1.0;

// ─────────────────────────────────────────────────────────────────────────────
// Lexicons used by source-classification helpers
// ─────────────────────────────────────────────────────────────────────────────

/// Substrings that identify user-input flow into XML parsing.
/// Mirrors the existing `USER_INPUT` regex at old `xxe.rs:21` minus
/// the regex-specific anchoring.
///
/// Entries must be lowercase: `matches_user_input` lowercases the
/// scanned text before doing a plain substring search.
const USER_INPUT_SUBSTRINGS: &[&str] = &[
    "req.body",
    "req.file",
    "req.files",
    "request.data",
    "request.body",
    "request.files",
    "request.get_data",
    "uploaded",
    "file_content",
    "getinputstream",
];

/// Substrings that identify test code. Mirrors 2d.
///
/// Matched as raw substrings of the lowercased function name (no
/// word-boundary check), so e.g. a name containing `latest_` also
/// matches `test_`.
const TEST_FUNCTION_SUBSTRINGS: &[&str] = &["test_", "_test", "fixture", "setup", "teardown"];

/// Substrings that identify a request-handler function. Mirrors 2d.
///
/// Matched as raw substrings of the lowercased function name. Only
/// consulted by `predict` when the test lexicon did not match.
const HANDLER_FUNCTION_SUBSTRINGS: &[&str] = &[
    "handler",
    "route",
    "endpoint",
    "view",
    "controller",
    "middleware",
    "request",
    "response",
    "upload",
    "import",
    "parse",
];

// ─────────────────────────────────────────────────────────────────────────────
// Evidence
// ─────────────────────────────────────────────────────────────────────────────

/// Structured evidence extracted from an XML-parser call site.
///
/// This is the sole input to `predict`. `Default` yields "no evidence":
/// every flag false and every optional field `None`.
#[derive(Debug, Clone, Default, PartialEq)]
pub(super) struct Evidence {
    /// Which parser API the call site uses. `None` is treated as
    /// `XmlApi::Unknown` by `predict`.
    pub api: Option<XmlApi>,

    /// Name of the enclosing function, if any. Matched against the
    /// test lexicon first, then the handler lexicon.
    pub enclosing_function: Option<String>,

    /// Name of the enclosing class, if any (informational; no weight).
    /// Not read by `predict`.
    pub enclosing_class: Option<String>,

    /// `defusedxml` imported anywhere in the file. File-scoped per
    /// the D5 #1 v0 limitation.
    pub import_defusedxml: bool,

    /// `lxml.etree` imported anywhere in the file.
    pub import_lxml_etree: bool,

    /// `resolve_entities=False` kwarg present on the parser
    /// constructor at the call site (or co-located within the same
    /// statement).
    pub kw_resolve_entities_false: bool,

    /// `no_network=True` kwarg present.
    pub kw_no_network_true: bool,

    /// `forbid_dtd=True` kwarg present.
    pub kw_forbid_dtd_true: bool,

    /// User input (request body / uploaded data) flows into the
    /// parser within a 10-line lookback window. Carry-over of the
    /// existing heuristic at old `xxe.rs:181`.
    ///
    /// Besides contributing a weight, this flag selects Critical vs
    /// High for the RealBug branch's severity.
    pub has_user_input_flow: bool,

    /// `Some(reason)` if a `# repotoire: xxe-safe[<reason>]`
    /// annotation appears on the call line. **Collapsing**.
    ///
    /// Checked before `xxe_vulnerable_annotation`, so it wins if both
    /// are set.
    pub xxe_safe_annotation: Option<String>,

    /// `Some(source)` if a `# repotoire: xxe-vulnerable[<source>]`
    /// annotation appears on the call line. **Collapsing**.
    pub xxe_vulnerable_annotation: Option<String>,
}

impl Evidence {
    /// Test-only convenience constructor: evidence with no signals
    /// set (identical to the `Default` value).
    #[cfg(test)]
    pub(super) fn empty() -> Self {
        Evidence::default()
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Prediction
// ─────────────────────────────────────────────────────────────────────────────

/// Output of `predict`: the chosen branch plus everything needed to
/// present the opposite interpretation alongside it.
#[derive(Debug, Clone)]
pub(super) struct Prediction {
    /// The branch the predictor committed to.
    pub predicted: BranchLabel,
    /// The opposite branch, carrying its own severity, title,
    /// description and suggested fix.
    pub alternative_branch: AlternativeBranch,
    /// Severity of the predicted branch, as computed by
    /// `severity_for_branch`.
    pub predicted_severity: Severity,
    /// Signals that contributed to the decision. Collapse paths emit
    /// exactly one reason; weighted scoring emits one per fired signal.
    pub reasons: Vec<PredictionReason>,
    /// Resolution signals. Contains one entry on a collapse path and
    /// is empty when the prediction came from weighted scoring.
    pub resolutions: Vec<ResolutionSignal>,
}

// ─────────────────────────────────────────────────────────────────────────────
// Scorer
// ─────────────────────────────────────────────────────────────────────────────

/// Build a [`Prediction`] from extracted [`Evidence`].
///
/// # Algorithm
///
/// 1. **Collapsing signals first.** If `xxe_safe_annotation` or
///    `xxe_vulnerable_annotation` is set, commit to the corresponding
///    branch with confidence 1.0 and skip weighted scoring.
/// 2. **Weighted scoring.** Sum weights for each present signal.
/// 3. **Tiebreak**: sum exactly 0.0 → predict RealBug. Conservative
///    default for security findings.
///
/// # Severity mapping
///
/// - Predicted RealBug → `Critical` if `has_user_input_flow`, else `High`.
/// - Predicted Benign → `Severity::Info`.
pub(super) fn predict(evidence: &Evidence) -> Prediction {
    let api = evidence.api.unwrap_or(XmlApi::Unknown);
    let api_label = api.callee_label();

    // ── Step 1: collapsing annotations. ──
    if let Some(reason) = &evidence.xxe_safe_annotation {
        return collapse(
            BranchLabel::Benign,
            api,
            evidence.has_user_input_flow,
            ResolutionSignal {
                kind: ResolutionKind::SourceAnnotation {
                    syntax: format!("# repotoire: xxe-safe[{reason}]"),
                },
                description: format!(
                    "`xxe-safe[{reason}]` annotation declares this XML \
                     parse as protected (caller-side validation, XSD \
                     pre-check, etc.); the finding collapses to Info."
                ),
                example: Some(format!("{api_label}(...)  # repotoire: xxe-safe[{reason}]")),
                collapses_to: BranchLabel::Benign,
            },
            PredictionReason {
                kind: PredictionReasonKind::Custom {
                    description: format!("xxe-safe[{reason}] annotation"),
                },
                weight: 1.0,
                note: format!(
                    "Annotated as caller-validated ({reason}); not an \
                     XXE risk."
                ),
            },
        );
    }
    if let Some(source) = &evidence.xxe_vulnerable_annotation {
        return collapse(
            BranchLabel::RealBug,
            api,
            evidence.has_user_input_flow,
            ResolutionSignal {
                kind: ResolutionKind::SourceAnnotation {
                    syntax: format!("# repotoire: xxe-vulnerable[{source}]"),
                },
                description: format!(
                    "`xxe-vulnerable[{source}]` annotation declares this \
                     XML parser as exposed to attacker-controlled XML; \
                     the finding stays at the existing severity."
                ),
                example: Some(format!(
                    "{api_label}(...)  # repotoire: xxe-vulnerable[{source}]"
                )),
                collapses_to: BranchLabel::RealBug,
            },
            PredictionReason {
                kind: PredictionReasonKind::Custom {
                    description: format!("xxe-vulnerable[{source}] annotation"),
                },
                weight: -1.0,
                note: format!("Annotated as XXE-exposed (source: {source})."),
            },
        );
    }

    // ── Step 1.5: collapsing on Defusedxml call. ──
    //
    // A call site on a `defusedxml.*` API is safe-by-construction:
    // entity resolution is disabled at the parser level, so the
    // input source doesn't matter. We treat this as a collapsing
    // signal in the same family as the `xxe-safe` annotation
    // (decisions D1 amendment, 2026-05-11) rather than an additive
    // weight, because handler + user-input combined (-0.80) would
    // otherwise dominate even a +0.50 additive signal.
    if matches!(api, XmlApi::Defusedxml) {
        return collapse(
            BranchLabel::Benign,
            api,
            evidence.has_user_input_flow,
            ResolutionSignal {
                kind: ResolutionKind::StructuralPattern {
                    description: "call on defusedxml API (safe-by-construction)".to_string(),
                },
                description: "`defusedxml.*` parsers are safe-by-construction against XXE: \
                     entity resolution, DTD processing, and external network \
                     fetches are all disabled by default. The input source \
                     (user-controlled or not) is irrelevant once entity \
                     resolution is off at the parser level."
                    .to_string(),
                example: Some(format!("{api_label}(...)")),
                collapses_to: BranchLabel::Benign,
            },
            PredictionReason {
                kind: PredictionReasonKind::StructuralPattern {
                    description: "parse call uses defusedxml (safe-by-construction)".to_string(),
                },
                weight: W_API_DEFUSEDXML_CALL,
                note: "The call site is on a `defusedxml.*` API. defusedxml \
                       disables all entity-resolution defaults, so user-\
                       controlled XML cannot mount an XXE attack. Strongest \
                       Benign signal in the v0 model."
                    .to_string(),
            },
        );
    }

    // ── Step 2: weighted scoring. ──
    let mut sum: f32 = 0.0;
    let mut reasons: Vec<PredictionReason> = Vec::new();

    if evidence.import_defusedxml {
        sum += W_IMPORT_DEFUSEDXML;
        reasons.push(PredictionReason {
            kind: PredictionReasonKind::ImportPresence {
                module: "defusedxml".to_string(),
            },
            weight: W_IMPORT_DEFUSEDXML,
            note: "`defusedxml` is safe-by-default for XML parsing. \
                   The import is weak signal (file-scoped, not call-\
                   scoped per v0); a stdlib parse in the same file can \
                   still flip the verdict to RealBug."
                .to_string(),
        });
    }

    if evidence.import_lxml_etree {
        sum += W_IMPORT_LXML_ETREE;
        reasons.push(PredictionReason {
            kind: PredictionReasonKind::ImportPresence {
                module: "lxml.etree".to_string(),
            },
            weight: W_IMPORT_LXML_ETREE,
            note: "`lxml.etree` is capable of safe configuration via \
                   `resolve_entities=False` / `no_network=True` but is \
                   not safe-by-default. Weak signal on its own; pairs \
                   with the kwarg signals."
                .to_string(),
        });
    }

    if evidence.kw_resolve_entities_false {
        sum += W_KW_RESOLVE_ENTITIES_FALSE;
        reasons.push(PredictionReason {
            kind: PredictionReasonKind::KeywordArgument {
                name: "resolve_entities".to_string(),
                value: "False".to_string(),
            },
            weight: W_KW_RESOLVE_ENTITIES_FALSE,
            note: "`resolve_entities=False` on the lxml parser disables \
                   external entity resolution; the OWASP-recommended \
                   protection against XXE."
                .to_string(),
        });
    }

    if evidence.kw_no_network_true {
        sum += W_KW_NO_NETWORK_TRUE;
        reasons.push(PredictionReason {
            kind: PredictionReasonKind::KeywordArgument {
                name: "no_network".to_string(),
                value: "True".to_string(),
            },
            weight: W_KW_NO_NETWORK_TRUE,
            note: "`no_network=True` closes the SSRF-via-XXE leg by \
                   preventing the parser from fetching external entities \
                   over the network."
                .to_string(),
        });
    }

    if evidence.kw_forbid_dtd_true {
        sum += W_KW_FORBID_DTD_TRUE;
        reasons.push(PredictionReason {
            kind: PredictionReasonKind::KeywordArgument {
                name: "forbid_dtd".to_string(),
                value: "True".to_string(),
            },
            weight: W_KW_FORBID_DTD_TRUE,
            note: "`forbid_dtd=True` is the defusedxml opt-in to refuse \
                   any DOCTYPE declaration; closes the entire XXE class."
                .to_string(),
        });
    }

    if evidence.has_user_input_flow {
        sum += W_USER_INPUT_FLOW;
        reasons.push(PredictionReason {
            kind: PredictionReasonKind::StructuralPattern {
                description: "user input flows to parser within 10 lines".to_string(),
            },
            weight: W_USER_INPUT_FLOW,
            note: "Request body / uploaded data / file content flows \
                   into the parse call within a 10-line lookback window. \
                   Attacker controls the XML being parsed."
                .to_string(),
        });
    }

    if api.is_stdlib_unsafe() {
        sum += W_STDLIB_UNSAFE_PARSER;
        reasons.push(PredictionReason {
            kind: PredictionReasonKind::StructuralPattern {
                description: "parser uses stdlib xml module without protection".to_string(),
            },
            weight: W_STDLIB_UNSAFE_PARSER,
            note: "The stdlib `xml.*` modules are documented as unsafe \
                   by default in Python ≤ 3.11 \
                   (https://docs.python.org/3/library/xml.html#xml-vulnerabilities). \
                   Without explicit hardening, this parser will resolve \
                   external entities."
                .to_string(),
        });
    }

    // Note: a `Defusedxml` call is handled by the Step 1.5 collapse
    // above and never reaches this point. Lxml/StdlibElementTree/
    // StdlibOther/Unknown fall through to additive scoring.

    if let Some(fn_name) = &evidence.enclosing_function {
        if matches_test_function(fn_name) {
            sum += W_ENCLOSING_TEST_FUNCTION;
            reasons.push(PredictionReason {
                kind: PredictionReasonKind::EnclosingScope {
                    scope_kind: "function".to_string(),
                    name: fn_name.clone(),
                },
                weight: W_ENCLOSING_TEST_FUNCTION,
                note: format!(
                    "Enclosing function `{fn_name}` looks like a \
                     test/fixture; test code rarely the actionable \
                     security target."
                ),
            });
        } else if matches_handler_function(fn_name) {
            sum += W_ENCLOSING_HANDLER;
            reasons.push(PredictionReason {
                kind: PredictionReasonKind::EnclosingScope {
                    scope_kind: "request_handler".to_string(),
                    name: fn_name.clone(),
                },
                weight: W_ENCLOSING_HANDLER,
                note: format!(
                    "Enclosing function `{fn_name}` looks like a request \
                     handler (`handler`/`route`/`endpoint`/`view`/\
                     `controller`/`upload`/`parse`); higher prior on \
                     attacker-reachable code."
                ),
            });
        }
    }

    // ── Step 3: tiebreak + severity mapping. ──
    let predicted = if sum > 0.0 {
        BranchLabel::Benign
    } else {
        // Strict 0.0 tiebreak: lean RealBug. Conservative default
        // matching Phase 2a/2b/2c/2d behavior for security findings.
        BranchLabel::RealBug
    };

    build_prediction(
        predicted,
        api,
        evidence.has_user_input_flow,
        reasons,
        Vec::new(),
    )
}

// ─────────────────────────────────────────────────────────────────────────────
// Helpers
// ─────────────────────────────────────────────────────────────────────────────

/// Returns true when `text`, compared case-insensitively, contains any
/// entry of `USER_INPUT_SUBSTRINGS`.
pub(super) fn matches_user_input(text: &str) -> bool {
    let haystack = text.to_lowercase();
    for needle in USER_INPUT_SUBSTRINGS {
        if haystack.contains(*needle) {
            return true;
        }
    }
    false
}

/// Returns true when the lowercased function `name` contains any entry
/// of `TEST_FUNCTION_SUBSTRINGS`.
fn matches_test_function(name: &str) -> bool {
    let folded = name.to_lowercase();
    for marker in TEST_FUNCTION_SUBSTRINGS {
        if folded.contains(*marker) {
            return true;
        }
    }
    false
}

/// Returns true when the lowercased function `name` contains any entry
/// of `HANDLER_FUNCTION_SUBSTRINGS`.
///
/// Mirrors 2d minus the HTTP-verb-prefix special-case (XXE handlers in
/// the wild don't typically use `getX`/`postX` naming — they use
/// Flask/Django route decorators with named functions like
/// `handle_upload`).
fn matches_handler_function(name: &str) -> bool {
    let folded = name.to_lowercase();
    HANDLER_FUNCTION_SUBSTRINGS
        .iter()
        .copied()
        .any(|marker| folded.contains(marker))
}

/// Builds the prediction for a collapsing signal: the given `label`
/// backed by exactly one reason and one resolution signal.
fn collapse(
    label: BranchLabel,
    api: XmlApi,
    has_user_input: bool,
    resolution: ResolutionSignal,
    reason: PredictionReason,
) -> Prediction {
    let reasons = vec![reason];
    let resolutions = vec![resolution];
    build_prediction(label, api, has_user_input, reasons, resolutions)
}

/// Assembles a [`Prediction`] for `predicted`, deriving the opposite
/// branch (label, severity, title, description, suggested fix) from
/// the same API label and user-input flag.
fn build_prediction(
    predicted: BranchLabel,
    api: XmlApi,
    has_user_input: bool,
    reasons: Vec<PredictionReason>,
    resolutions: Vec<ResolutionSignal>,
) -> Prediction {
    let label_text = api.callee_label();
    let other = predicted.opposite();
    let predicted_severity = severity_for_branch(predicted, has_user_input);

    Prediction {
        predicted,
        alternative_branch: AlternativeBranch {
            label: other,
            severity: severity_for_branch(other, has_user_input),
            title: title_for_branch(other, label_text),
            description: description_for_branch(other, label_text),
            suggested_fix: suggested_fix_for_branch(other, label_text),
        },
        predicted_severity,
        reasons,
        resolutions,
    }
}

/// D3 severity mapping. Benign is always `Info`; RealBug is `Critical`
/// when user input flows to the parser and `High` otherwise (mirrors
/// old `xxe.rs:322-327`).
fn severity_for_branch(label: BranchLabel, has_user_input: bool) -> Severity {
    match (label, has_user_input) {
        (BranchLabel::Benign, _) => Severity::Info,
        (BranchLabel::RealBug, true) => Severity::Critical,
        (BranchLabel::RealBug, false) => Severity::High,
    }
}

/// Finding title for the given branch, naming the parser API.
fn title_for_branch(label: BranchLabel, api_label: &str) -> String {
    match label {
        BranchLabel::Benign => {
            format!("XML parse via {api_label} appears safely configured (informational)")
        }
        BranchLabel::RealBug => format!("Potential XXE vulnerability in {api_label} parse"),
    }
}

/// Long-form finding description for the given branch, naming the
/// parser API.
fn description_for_branch(label: BranchLabel, api_label: &str) -> String {
    match label {
        BranchLabel::Benign => format!(
            "The `{api_label}` parser appears to be either safe-by-\
             default (defusedxml) or explicitly hardened \
             (`resolve_entities=False`, `no_network=True`). The call \
             site is carried as Info; the RealBug interpretation is \
             preserved in `alternative_branch` in case the predictor \
             is wrong."
        ),
        BranchLabel::RealBug => format!(
            "The `{api_label}` parser appears to be operating without \
             protections against external entity resolution. XXE \
             vulnerabilities allow attackers to read arbitrary files \
             (`file:///etc/passwd`), perform SSRF \
             (`http://internal-server/`), launch denial-of-service \
             attacks (billion laughs), and port-scan internal networks."
        ),
    }
}

/// Remediation text for the given branch. Always returns `Some`; the
/// API label is currently unused.
fn suggested_fix_for_branch(label: BranchLabel, _api_label: &str) -> Option<String> {
    let advice: &str = match label {
        BranchLabel::Benign => {
            "If this is intentional safe usage, annotate \
             `# repotoire: xxe-safe[<reason>]` to collapse the finding \
             to Info definitively."
        }
        BranchLabel::RealBug => {
            "Use `defusedxml` (the safe-by-default Python XML library) \
             or explicitly disable external entity resolution on the \
             existing parser. Example: `from defusedxml.ElementTree \
             import parse; parse(blob)`. For lxml: \
             `etree.XMLParser(resolve_entities=False, no_network=True)`."
        }
    };
    Some(advice.to_string())
}

// ─────────────────────────────────────────────────────────────────────────────
// Annotation lookup helpers (called by evidence extraction)
// ─────────────────────────────────────────────────────────────────────────────

/// Extracts the reason from a `# repotoire: xxe-safe[<reason>]`
/// annotation on `line`.
///
/// Returns `None` when the line has no parseable annotation or the
/// annotation is of a different kind. When the annotation has no
/// argument the reason is `"unspecified"`; otherwise it is the first
/// argument.
pub(super) fn extract_xxe_safe_reason(line: &str) -> Option<String> {
    let ann = parse_python_comment(line)?;
    if ann.kind == "xxe-safe" {
        let reason = match ann.args.first() {
            Some(first) => first.clone(),
            None => "unspecified".to_string(),
        };
        Some(reason)
    } else {
        None
    }
}

/// Extracts the source from a `# repotoire: xxe-vulnerable[<source>]`
/// annotation on `line`.
///
/// Returns `None` when the line has no parseable annotation or the
/// annotation is of a different kind. When the annotation has no
/// argument the source is `"unspecified"`; otherwise it is the first
/// argument.
pub(super) fn extract_xxe_vulnerable_source(line: &str) -> Option<String> {
    let ann = parse_python_comment(line)?;
    if ann.kind == "xxe-vulnerable" {
        let source = match ann.args.first() {
            Some(first) => first.clone(),
            None => "unspecified".to_string(),
        };
        Some(source)
    } else {
        None
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    // Unit tests for this module: worked examples through `predict`,
    // collapsing annotations, the tiebreak, weight sign/calibration pins,
    // the lexicon matchers, the annotation extractors, and the `XmlApi`
    // helper.

    // ─── Worked example 1 (decisions D1): canonical safe lxml. ───
    #[test]
    fn lxml_with_explicit_protection_predicts_benign_info() {
        let evidence = Evidence {
            api: Some(XmlApi::LxmlEtree),
            import_lxml_etree: true,
            kw_resolve_entities_false: true,
            kw_no_network_true: true,
            ..Default::default()
        };
        let p = predict(&evidence);
        assert_eq!(p.predicted, BranchLabel::Benign);
        assert_eq!(p.predicted_severity, Severity::Info);
        // Alternative: RealBug at severity High (no user input).
        assert_eq!(p.alternative_branch.label, BranchLabel::RealBug);
        assert_eq!(p.alternative_branch.severity, Severity::High);
        let total: f32 = p.reasons.iter().map(|r| r.weight).sum();
        assert!(
            (total - (W_IMPORT_LXML_ETREE + W_KW_RESOLVE_ENTITIES_FALSE + W_KW_NO_NETWORK_TRUE))
                .abs()
                < 1e-6,
            "expected +0.80, got {total}"
        );
    }

    // ─── Worked example 2 (decisions D1): canonical RealBug shape. ───
    #[test]
    fn stdlib_etree_with_user_input_in_handler_predicts_realbug_critical() {
        let evidence = Evidence {
            api: Some(XmlApi::StdlibElementTree),
            has_user_input_flow: true,
            enclosing_function: Some("handle_upload".to_string()),
            ..Default::default()
        };
        let p = predict(&evidence);
        assert_eq!(p.predicted, BranchLabel::RealBug);
        assert_eq!(p.predicted_severity, Severity::Critical);
        assert_eq!(p.alternative_branch.label, BranchLabel::Benign);
        assert_eq!(p.alternative_branch.severity, Severity::Info);
        let total: f32 = p.reasons.iter().map(|r| r.weight).sum();
        assert!(
            (total - (W_STDLIB_UNSAFE_PARSER + W_USER_INPUT_FLOW + W_ENCLOSING_HANDLER)).abs()
                < 1e-6,
            "expected -1.00, got {total}"
        );
    }

    // ─── Worked example 3 (decisions D5 #1): unused defusedxml import. ───
    //
    // `import defusedxml` + stdlib parse with no other signals.
    // +0.30 + -0.20 = +0.10 → Benign tiebreak-close (documented v0
    // false-positive).
    #[test]
    fn unused_defusedxml_import_with_stdlib_parse_predicts_benign() {
        let evidence = Evidence {
            api: Some(XmlApi::StdlibElementTree),
            import_defusedxml: true,
            ..Default::default()
        };
        let p = predict(&evidence);
        assert_eq!(p.predicted, BranchLabel::Benign);
        assert_eq!(p.predicted_severity, Severity::Info);
        let total: f32 = p.reasons.iter().map(|r| r.weight).sum();
        assert!(
            (total - (W_IMPORT_DEFUSEDXML + W_STDLIB_UNSAFE_PARSER)).abs() < 1e-6,
            "expected +0.10, got {total}"
        );
    }

    // ─── Worked example 4 (decisions D1 honest-review #1): the
    // unused-defusedxml case STILL flips RealBug when any negative
    // signal fires. This is the calibration justification for the
    // +0.30 weight choice. ───
    #[test]
    fn unused_defusedxml_in_handler_with_user_input_correctly_flips_realbug() {
        let evidence = Evidence {
            api: Some(XmlApi::StdlibElementTree),
            import_defusedxml: true,
            has_user_input_flow: true,
            enclosing_function: Some("handle_upload".to_string()),
            ..Default::default()
        };
        let p = predict(&evidence);
        assert_eq!(p.predicted, BranchLabel::RealBug);
        // +0.30 + -0.20 + -0.50 + -0.30 = -0.70. The expected total is
        // built from the weight constants, matching the other worked
        // examples, rather than from a hard-coded literal.
        let total: f32 = p.reasons.iter().map(|r| r.weight).sum();
        let expected = W_IMPORT_DEFUSEDXML
            + W_STDLIB_UNSAFE_PARSER
            + W_USER_INPUT_FLOW
            + W_ENCLOSING_HANDLER;
        assert!(
            (total - expected).abs() < 1e-6,
            "expected {expected}, got {total}"
        );
    }

    // ─── Worked example 5: canonical defusedxml safe usage. ───
    #[test]
    fn defusedxml_canonical_usage_predicts_benign() {
        // Defusedxml call → Step 1.5 collapse → Benign with the
        // single `W_API_DEFUSEDXML_CALL`-weighted reason. The
        // `import_defusedxml` signal is not added by the collapse
        // path (collapsing returns immediately) but that's fine:
        // the call-site signal is decisive.
        let evidence = Evidence {
            api: Some(XmlApi::Defusedxml),
            import_defusedxml: true,
            ..Default::default()
        };
        let p = predict(&evidence);
        assert_eq!(p.predicted, BranchLabel::Benign);
        assert_eq!(p.predicted_severity, Severity::Info);
        assert!(
            p.reasons.iter().any(|r| r.weight == W_API_DEFUSEDXML_CALL),
            "must emit the defusedxml-call reason"
        );
    }

    // ─── Worked example 5b: defusedxml call with user input (the
    // case that surfaced the missing W_API_DEFUSEDXML_CALL signal
    // during integration testing — pins the fix).
    #[test]
    fn defusedxml_call_with_user_input_stays_benign() {
        // Without the call-site collapse: +0.30 (import) - 0.50
        // (input) - 0.30 (handler) = -0.50 → RealBug. WRONG —
        // defusedxml is safe-by-construction regardless of input.
        // With the collapse: Step 1.5 returns Benign immediately,
        // regardless of downstream additive signals.
        let evidence = Evidence {
            api: Some(XmlApi::Defusedxml),
            import_defusedxml: true,
            has_user_input_flow: true,
            enclosing_function: Some("handler".to_string()),
            ..Default::default()
        };
        let p = predict(&evidence);
        assert_eq!(
            p.predicted,
            BranchLabel::Benign,
            "defusedxml call stays Benign even with user input + handler"
        );
        assert_eq!(p.predicted_severity, Severity::Info);
    }

    // ─── Collapsing annotations ───
    #[test]
    fn xxe_safe_annotation_collapses_to_benign() {
        let evidence = Evidence {
            // Other signals say RealBug, but the annotation overrides.
            api: Some(XmlApi::StdlibElementTree),
            has_user_input_flow: true,
            enclosing_function: Some("handle_upload".to_string()),
            xxe_safe_annotation: Some("xsd-validated-upstream".to_string()),
            ..Default::default()
        };
        let p = predict(&evidence);
        assert_eq!(p.predicted, BranchLabel::Benign);
        assert_eq!(p.predicted_severity, Severity::Info);
        assert_eq!(p.resolutions.len(), 1);
        assert!(matches!(
            p.resolutions[0].kind,
            ResolutionKind::SourceAnnotation { .. }
        ));
    }

    #[test]
    fn xxe_vulnerable_annotation_collapses_to_realbug() {
        let evidence = Evidence {
            // Other signals say Benign, but the annotation overrides.
            api: Some(XmlApi::Defusedxml),
            import_defusedxml: true,
            kw_forbid_dtd_true: true,
            xxe_vulnerable_annotation: Some("audited-untrusted-source".to_string()),
            ..Default::default()
        };
        let p = predict(&evidence);
        assert_eq!(p.predicted, BranchLabel::RealBug);
        // No user input → severity High.
        assert_eq!(p.predicted_severity, Severity::High);
    }

    // ─── Tiebreak ───
    #[test]
    fn empty_evidence_tiebreaks_realbug() {
        let p = predict(&Evidence::empty());
        assert_eq!(p.predicted, BranchLabel::RealBug);
        // No user input → severity High.
        assert_eq!(p.predicted_severity, Severity::High);
    }

    // ─── Sign convention ───
    //
    // These tests pin the sign convention. A future edit changing a
    // sign would silently flip the predicted branch — these
    // assertions catch that at test time.
    #[test]
    // The assertions are on constants on purpose: the test exists to pin
    // the constants' signs.
    #[allow(clippy::assertions_on_constants)]
    fn realbug_signal_weights_are_negative() {
        assert!(W_USER_INPUT_FLOW < 0.0);
        assert!(W_ENCLOSING_HANDLER < 0.0);
        assert!(W_STDLIB_UNSAFE_PARSER < 0.0);
    }

    #[test]
    // The assertions are on constants on purpose: the test exists to pin
    // the constants' signs.
    #[allow(clippy::assertions_on_constants)]
    fn benign_signal_weights_are_positive() {
        assert!(W_IMPORT_DEFUSEDXML > 0.0);
        assert!(W_IMPORT_LXML_ETREE > 0.0);
        assert!(W_KW_RESOLVE_ENTITIES_FALSE > 0.0);
        assert!(W_KW_NO_NETWORK_TRUE > 0.0);
        assert!(W_KW_FORBID_DTD_TRUE > 0.0);
        assert!(W_ENCLOSING_TEST_FUNCTION > 0.0);
    }

    // ─── Honest review note pin: defusedxml weight calibration. ───
    //
    // The decisions doc D1 explicitly calibrates the defusedxml import
    // weight against the unused-import + handler + user-input scenario.
    // This test pins the constraint: ANY single negative signal must
    // be able to overcome the unused-import + stdlib-unsafe positive
    // (0.30 - 0.20 = 0.10).
    #[test]
    // The assertions are on constants on purpose: the test pins relations
    // between the weight constants themselves.
    #[allow(clippy::assertions_on_constants)]
    fn defusedxml_weight_is_overcome_by_single_negative_signal() {
        let unused_import_baseline = W_IMPORT_DEFUSEDXML + W_STDLIB_UNSAFE_PARSER;
        // 0.10 — barely positive.
        assert!(unused_import_baseline > 0.0);
        // Upper bound: the baseline stays below the magnitude of the
        // `resolve_entities=False` kwarg weight.
        assert!(unused_import_baseline < W_KW_RESOLVE_ENTITIES_FALSE.abs());
        // Each negative signal individually larger than baseline:
        assert!(W_USER_INPUT_FLOW.abs() > unused_import_baseline);
        assert!(W_ENCLOSING_HANDLER.abs() > unused_import_baseline);
    }

    // ─── Lexicon checks ───
    #[test]
    fn user_input_lexicon() {
        assert!(matches_user_input("blob = request.data"));
        assert!(matches_user_input("uploaded_xml = req.files['x']"));
        assert!(matches_user_input("content = request.get_data()"));
        assert!(!matches_user_input("blob = open('config.xml').read()"));
    }

    #[test]
    fn handler_lexicon() {
        assert!(matches_handler_function("handle_upload"));
        assert!(matches_handler_function("parse_route"));
        assert!(matches_handler_function("import_data"));
        assert!(!matches_handler_function("calculate_total"));
    }

    #[test]
    fn test_function_lexicon() {
        assert!(matches_test_function("test_xxe_parse"));
        assert!(matches_test_function("xxe_test"));
        assert!(matches_test_function("setup_fixture"));
        assert!(!matches_test_function("parse_data"));
    }

    // ─── Extract helpers ───
    #[test]
    fn extract_xxe_safe_with_reason() {
        assert_eq!(
            extract_xxe_safe_reason("ET.parse(blob)  # repotoire: xxe-safe[xsd-validated]"),
            Some("xsd-validated".to_string())
        );
    }

    #[test]
    fn extract_xxe_safe_without_reason() {
        assert_eq!(
            extract_xxe_safe_reason("ET.parse(blob)  # repotoire: xxe-safe"),
            Some("unspecified".to_string())
        );
    }

    #[test]
    fn extract_xxe_vulnerable_with_source() {
        assert_eq!(
            extract_xxe_vulnerable_source("ET.parse(blob)  # repotoire: xxe-vulnerable[audited]"),
            Some("audited".to_string())
        );
    }

    #[test]
    fn extract_xxe_safe_ignores_other_kinds() {
        // Other detectors' annotations must NOT match the 2e extractor.
        assert_eq!(
            extract_xxe_safe_reason("subprocess.run(...)  # repotoire: command-static[ok]"),
            None
        );
        assert_eq!(
            extract_xxe_safe_reason("os.path.join(...)  # repotoire: internal-path[ok]"),
            None
        );
    }

    #[test]
    fn extract_xxe_vulnerable_ignores_other_kinds() {
        assert_eq!(
            extract_xxe_vulnerable_source(
                "subprocess.run(...)  # repotoire: command-user-controlled[GET]"
            ),
            None
        );
    }

    // ─── XmlApi helpers ───
    #[test]
    fn xml_api_is_stdlib_unsafe() {
        assert!(XmlApi::StdlibElementTree.is_stdlib_unsafe());
        assert!(XmlApi::StdlibOther.is_stdlib_unsafe());
        assert!(!XmlApi::Defusedxml.is_stdlib_unsafe());
        assert!(!XmlApi::LxmlEtree.is_stdlib_unsafe());
        assert!(!XmlApi::Unknown.is_stdlib_unsafe());
    }
}