// repotoire 0.8.0

//! XXE Injection Detector
//!
//! Graph-enhanced detection of XXE vulnerabilities:
//! - Detect XML parsers without secure configuration
//! - Language-specific protection checks
//! - Trace user input to XML parsing

// Phase 2e dual-branch submodules. `evidence` and `predict` back the
// Python AST path (`scan_python_file_dual_branch`) further down this file.
mod annotation;
mod evidence;
mod predict;

use crate::detectors::base::{Detector, DetectorConfig};
use crate::graph::GraphQueryExt;
use crate::models::{deterministic_finding_id, Finding, Severity};
use anyhow::Result;
use regex::Regex;
use std::path::{Path, PathBuf};
use std::sync::LazyLock;
use tracing::info;

/// Case-insensitive matcher for XML parser API names across the scanned
/// languages. The legacy line scanner uses it as the first gate on each line.
static XXE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    const XML_API_NAMES: &str = r"(?i)(xml\.parse|parseXML|XMLParser|DocumentBuilder|SAXParser|etree\.parse|lxml\.etree|xml\.etree|DOMParser|XMLReader|xml\.dom|minidom|pulldom|xml2js|fast-xml-parser|libxml)";
    Regex::new(XML_API_NAMES).expect("valid regex")
});
/// Case-sensitive matcher for tokens that suggest request- or file-derived
/// data (request bodies, uploads, stream reads). Used by the user-input
/// heuristic around a parse call.
static USER_INPUT: LazyLock<Regex> = LazyLock::new(|| {
    const INPUT_TOKENS: &str = r"(req\.(body|file|files)|request\.(data|files)|uploaded|file_content|input|read\(|getInputStream)";
    Regex::new(INPUT_TOKENS).expect("valid regex")
});

/// Returns the substrings whose presence marks an XML parser as hardened
/// against XXE in a file with extension `ext`.
///
/// The patterns are plain substrings, not regexes. Unknown extensions yield
/// an empty list, meaning no protection can be recognised for that language.
fn get_protection_patterns(ext: &str) -> Vec<&'static str> {
    match ext {
        "py" => vec![
            "resolve_entities=False",
            "no_network=True",
            "defusedxml",
            "forbid_dtd=True",
            "forbid_entities=True",
            "feature_external_ges",
            "feature_external_pes",
            "DTDForbidden",
            "EntitiesForbidden",
            "ExternalReferenceForbidden",
            "defused",
        ],
        "java" => vec![
            "FEATURE_SECURE_PROCESSING",
            "FEATURE_EXTERNAL_GENERAL_ENTITIES",
            "FEATURE_EXTERNAL_PARAMETER_ENTITIES",
            "FEATURE_DISALLOW_DOCTYPE_DECL",
            "setExpandEntityReferences(false)",
            // Feature URI used by the recommended `setFeature(...)` hardening
            // call; without it, code following that advice is still flagged.
            "disallow-doctype-decl",
        ],
        "js" | "ts" => vec![
            "noent: false",
            "nonet: true",
            "dtdload: false",
            "dtdvalid: false",
            "explicitEntities: false",
        ],
        "php" => vec![
            // `LIBXML_NOENT` and `LIBXML_DTDLOAD` are deliberately NOT listed:
            // they ENABLE entity substitution and external DTD loading, so
            // their presence signals a vulnerable parser, not a protected one.
            "libxml_disable_entity_loader",
            "libxml_set_external_entity_loader",
        ],
        "cs" => vec![
            "DtdProcessing.Prohibit",
            "XmlResolver = null",
            "ProhibitDtd = true",
        ],
        "rb" => vec![
            "nonet: true",
            "noent: false",
            "Nokogiri::XML::ParseOptions::NONET",
        ],
        _ => vec![],
    }
}

/// Returns a Markdown remediation snippet for the language identified by
/// file extension `ext`.
///
/// Known extensions get a fenced code example; anything else gets a generic
/// one-line recommendation. Because each line ends in `\n\`, the leading
/// whitespace of the following source line is not part of the string.
fn get_fix_example(ext: &str) -> &'static str {
    match ext {
        "py" => {
            "```python\n\
             # Use defusedxml (recommended)\n\
             import defusedxml.ElementTree as ET\n\
             tree = ET.parse(xml_file)\n\
             \n\
             # Or configure lxml safely\n\
             from lxml import etree\n\
             parser = etree.XMLParser(\n\
                 resolve_entities=False,\n\
                 no_network=True,\n\
                 dtd_validation=False\n\
             )\n\
             tree = etree.parse(xml_file, parser)\n\
             ```"
        }
        "java" => {
            "```java\n\
             DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();\n\
             \n\
             // Disable XXE\n\
             dbf.setFeature(\"http://apache.org/xml/features/disallow-doctype-decl\", true);\n\
             dbf.setFeature(\"http://xml.org/sax/features/external-general-entities\", false);\n\
             dbf.setFeature(\"http://xml.org/sax/features/external-parameter-entities\", false);\n\
             dbf.setXIncludeAware(false);\n\
             dbf.setExpandEntityReferences(false);\n\
             \n\
             DocumentBuilder db = dbf.newDocumentBuilder();\n\
             ```"
        }
        "js" | "ts" => {
            "```javascript\n\
             // Use a safe parser\n\
             const { XMLParser } = require('fast-xml-parser');\n\
             const parser = new XMLParser({\n\
                 allowBooleanAttributes: true,\n\
                 // No external entity resolution by default\n\
             });\n\
             \n\
             // Or configure libxmljs safely\n\
             const libxmljs = require('libxmljs');\n\
             const doc = libxmljs.parseXml(xmlString, {\n\
                 noent: false,  // Don't expand entities\n\
                 nonet: true,   // Don't fetch from network\n\
                 dtdload: false\n\
             });\n\
             ```"
        }
        // The example must never pass LIBXML_NOENT or LIBXML_DTDLOAD: those
        // flags turn entity substitution and external DTD loading ON, which
        // is exactly the vulnerable configuration.
        "php" => {
            "```php\n\
             // PHP < 8.0: disable the external entity loader (deprecated as of PHP 8.0)\n\
             libxml_disable_entity_loader(true);\n\
             \n\
             // PHP >= 8.0: refuse to resolve any external entity\n\
             libxml_set_external_entity_loader(function () { return null; });\n\
             \n\
             // Do NOT pass LIBXML_NOENT or LIBXML_DTDLOAD: they enable entity\n\
             // substitution and external DTD loading\n\
             $doc = new DOMDocument();\n\
             $doc->loadXML($xml, LIBXML_NONET);\n\
             \n\
             // SimpleXML: block network access, leave entity substitution off\n\
             $xml = simplexml_load_string($data, 'SimpleXMLElement', LIBXML_NONET);\n\
             ```"
        }
        "cs" => {
            "```csharp\n\
             XmlReaderSettings settings = new XmlReaderSettings();\n\
             settings.DtdProcessing = DtdProcessing.Prohibit;\n\
             settings.XmlResolver = null;\n\
             \n\
             using (XmlReader reader = XmlReader.Create(stream, settings))\n\
             {\n\
                 // Process XML safely\n\
             }\n\
             ```"
        }
        _ => "Disable external entity resolution in your XML parser configuration.",
    }
}

/// Detector for XML External Entity (XXE, CWE-611) issues.
pub struct XxeDetector {
    /// Repository root; handed to the fallback intra-function taint run in
    /// `detect` when no precomputed paths are available.
    repository_path: PathBuf,
    /// Cap on findings consulted by `detect`; `new` sets it to 50.
    max_findings: usize,
    /// Cross-function taint paths computed ahead of `detect`.
    /// NOTE(review): never read in this file; presumably filled and consumed
    /// via `impl_taint_precompute!` — confirm against the macro.
    precomputed_cross: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
    /// Intra-function taint paths computed ahead of `detect`; when set,
    /// `detect` uses them instead of running the taint analysis itself.
    precomputed_intra: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
}

impl XxeDetector {
    /// Creates a detector rooted at `repository_path`, with a cap of 50
    /// findings and both precomputed taint slots unset.
    pub fn new(repository_path: impl Into<PathBuf>) -> Self {
        let repository_path = repository_path.into();
        Self {
            repository_path,
            max_findings: 50,
            precomputed_cross: std::sync::OnceLock::default(),
            precomputed_intra: std::sync::OnceLock::default(),
        }
    }

    /// Reports whether `content` contains any XXE protection marker known for
    /// the language with extension `ext`. The comparison is case-insensitive:
    /// both the content and each marker are lowercased first.
    fn has_protection(content: &str, ext: &str) -> bool {
        let haystack = content.to_lowercase();
        for marker in get_protection_patterns(ext) {
            if haystack.contains(marker.to_lowercase().as_str()) {
                return true;
            }
        }
        false
    }

    /// Heuristically checks whether user-controlled input reaches the XML
    /// parse call at `parse_line` (0-based index into `lines`).
    ///
    /// Scans the parse line itself and up to 10 lines before it for a
    /// `USER_INPUT` match. The parse line is part of the window so that a
    /// direct call such as `parse(request.data)` is recognised. The window is
    /// clamped to `lines`, so an out-of-range `parse_line` cannot panic.
    fn has_user_input_flow(lines: &[&str], parse_line: usize) -> bool {
        let end = parse_line.saturating_add(1).min(lines.len());
        let start = parse_line.saturating_sub(10).min(end);

        // No `USER_INPUT` alternative contains whitespace, so testing each
        // line separately matches the same set as testing the joined window.
        lines[start..end].iter().any(|line| USER_INPUT.is_match(line))
    }

    /// Phase 2e dual-branch scan of a single Python file.
    ///
    /// The file is parsed once; every XML parse call site reported by
    /// [`evidence::collect_python_xml_sites`] becomes one dual-branch finding
    /// built by `build_dual_branch_xxe_finding`. `detect` uses this instead of
    /// the legacy line-regex pass for `.py` files when the `xxe` dual-branch
    /// flag is on.
    ///
    /// Yields an empty vec when the content contains a NUL byte, when parsing
    /// fails, or when the collector reports no sites (per its contract this
    /// includes files without XML imports).
    fn scan_python_file_dual_branch(&self, path: &Path, content: &str) -> Vec<Finding> {
        // Content with NUL bytes is skipped without attempting a parse.
        if content.contains('\0') {
            return Vec::new();
        }
        let parsed = crate::detectors::ast_fingerprint::parse_root_ext(
            content,
            crate::parsers::lightweight::Language::Python,
            "py",
        );
        let tree = match parsed {
            Some(tree) => tree,
            None => return Vec::new(),
        };
        let module_root = tree.root_node();
        let bytes = content.as_bytes();
        let src_lines: Vec<&str> = content.lines().collect();

        let mut out = Vec::new();
        for site in evidence::collect_python_xml_sites(module_root, bytes) {
            let row = site.call_node.start_position().row;
            let current = src_lines.get(row).copied();

            // Respect `# repotoire: ignore` / inline suppressions exactly as
            // the legacy path does; otherwise a user who suppressed the
            // legacy finding would see a new one appear when the flag is
            // switched on.
            if let Some(text) = current {
                let above = row.checked_sub(1).map(|r| src_lines[r]);
                if crate::detectors::is_line_suppressed(text, above) {
                    continue;
                }
            }

            let snippet = current.map(str::trim).unwrap_or("");
            let line_num = (row + 1) as u32;

            let finding = self.build_dual_branch_xxe_finding(
                path,
                line_num,
                site.api,
                snippet,
                site.call_node,
                module_root,
                bytes,
                &src_lines,
            );
            out.push(finding);
        }
        out
    }

    /// Build a dual-branch Finding for a single Python XXE call site.
    ///
    /// Mirrors `command_injection::build_dual_branch_python_finding`
    /// (commit `6c47b271`): pull evidence, run the predictor, pick a
    /// title/description/fix per branch label, attach the
    /// alternative branch + every prediction reason + every
    /// resolution signal. The result is a single `Finding` with the
    /// dual-branch shape that `--show-alternatives` knows how to
    /// render.
    fn build_dual_branch_xxe_finding(
        &self,
        path: &Path,
        line_num: u32,
        api: predict::XmlApi,
        snippet: &str,
        call_node: tree_sitter::Node<'_>,
        module_root: tree_sitter::Node<'_>,
        source: &[u8],
        lines: &[&str],
    ) -> Finding {
        let api_label = api.callee_label();
        let mut ev = evidence::extract_python_evidence(call_node, module_root, source, lines);
        ev.api = Some(api);
        let prediction = predict::predict(&ev);

        let predicted_label = prediction.predicted;
        let predicted_severity = prediction.predicted_severity;
        let predicted_title = match predicted_label {
            crate::dual_branch::BranchLabel::RealBug => {
                format!("Potential XXE via {api_label}")
            }
            crate::dual_branch::BranchLabel::Benign => {
                format!("Hardened XML parse via {api_label} (informational)")
            }
        };
        let predicted_description = format!(
            "**XML External Entity (dual-branch, CWE-611)**\n\n\
             **API**: `{}`\n\n\
             **Location**: {}:{}\n\n\
             **Code**:\n```python\n{}\n```\n\n\
             {}",
            api_label,
            path.display(),
            line_num,
            snippet,
            match predicted_label {
                crate::dual_branch::BranchLabel::RealBug => format!(
                    "The `{api_label}` call site does not show evidence of \
                     entity-resolution protection (no `resolve_entities=False`, \
                     `no_network=True`, or `forbid_dtd=True` kwarg co-located \
                     with the parse), and/or operates on attacker-influenceable \
                     input. The predictor leans RealBug for this call site \
                     (see `prediction_reasons`)."
                ),
                crate::dual_branch::BranchLabel::Benign => format!(
                    "The `{api_label}` call site uses a safe-by-default API \
                     (defusedxml) or pairs explicit protection kwargs with \
                     the parse. The predictor leans Benign (see \
                     `prediction_reasons`); the original `severity_for`-table \
                     interpretation is carried in `alternative_branch`."
                ),
            },
        );
        let predicted_fix = match predicted_label {
            crate::dual_branch::BranchLabel::RealBug => Some(format!(
                "{}\n\nIf this is a false positive (the parser is configured \
                 securely via a path the v0 predictor doesn't see — e.g. a \
                 parser variable built in a helper function), annotate the \
                 call site with `# repotoire: xxe-safe[<reason>]` to collapse \
                 the finding to Info.",
                get_fix_example("py")
            )),
            crate::dual_branch::BranchLabel::Benign => Some(
                "If this parse IS attacker-reachable (the alternative \
                 branch), switch to `defusedxml.ElementTree.parse(...)` or \
                 explicit lxml protection: `lxml.etree.XMLParser(\
                 resolve_entities=False, no_network=True)`. If the predictor \
                 is correct that this is a hardened call, no action needed."
                    .to_string(),
            ),
        };

        let mut finding = Finding {
            id: String::new(),
            detector: "XxeDetector".to_string(),
            severity: predicted_severity,
            title: predicted_title,
            description: predicted_description,
            affected_files: vec![path.to_path_buf()],
            line_start: Some(line_num),
            line_end: Some(line_num),
            suggested_fix: predicted_fix,
            estimated_effort: Some("20 minutes".to_string()),
            category: Some("security".to_string()),
            cwe_id: Some("CWE-611".to_string()),
            why_it_matters: Some(
                "XXE vulnerabilities allow attackers to:\n\
                 • Read arbitrary files from the server (file:///etc/passwd)\n\
                 • Perform SSRF attacks (http://internal-server/)\n\
                 • Denial of service (billion laughs attack)\n\
                 • Port scanning of internal networks"
                    .to_string(),
            ),
            ..Default::default()
        };

        finding = finding.with_alternative_branch(prediction.alternative_branch);
        for reason in prediction.reasons {
            finding = finding.with_prediction_reason(reason);
        }
        for resolution in prediction.resolutions {
            finding = finding.with_resolution_signal(resolution);
        }
        finding
    }
}

impl Detector for XxeDetector {
    /// Short identifier reported for this detector.
    fn name(&self) -> &'static str {
        const DETECTOR_NAME: &str = "xxe";
        DETECTOR_NAME
    }
    /// One-line summary of what this detector looks for.
    fn description(&self) -> &'static str {
        const SUMMARY: &str = "Detects XXE vulnerabilities";
        SUMMARY
    }

    /// Always returns `true`.
    /// NOTE(review): presumably opts this detector's findings out of the
    /// shared post-processing stage — confirm against the `Detector` trait.
    fn bypass_postprocessor(&self) -> bool {
        const BYPASS: bool = true;
        BYPASS
    }

    crate::detectors::impl_taint_precompute!();

    /// Taint category this detector is associated with: always `Xxe`.
    fn taint_category(&self) -> Option<crate::detectors::taint::TaintCategory> {
        use crate::detectors::taint::TaintCategory;

        // XXE has a dedicated category with XML-specific sinks
        // (per docs/superpowers/specs/2026-05-09-detector-precision-batch1-design.md §6).
        Some(TaintCategory::Xxe)
    }

    /// File extensions this detector declares interest in.
    ///
    /// NOTE(review): this list differs from the one `detect` passes to
    /// `files_with_extensions` (which has `cs` and `go` but not `jsx`/`tsx`)
    /// — confirm whether the mismatch is intentional.
    fn file_extensions(&self) -> &'static [&'static str] {
        const EXTENSIONS: &[&str] = &["py", "js", "ts", "jsx", "tsx", "rb", "php", "java"];
        EXTENSIONS
    }

    /// Content flags this detector requires: `HAS_SERIALIZE`.
    /// NOTE(review): presumably used to pre-filter files before `detect`
    /// runs — confirm against `ContentFlags` usage.
    fn content_requirements(&self) -> crate::detectors::detector_context::ContentFlags {
        use crate::detectors::detector_context::ContentFlags;

        ContentFlags::HAS_SERIALIZE
    }

    /// Runs the XXE scan over the files exposed by `ctx`.
    ///
    /// Findings come from three sources, in this order:
    /// 1. the AST-driven dual-branch path for `.py` files, when the `xxe`
    ///    dual-branch flag is enabled;
    /// 2. the legacy line-regex scanner for every other scanned file (and
    ///    for `.py` files when the flag is off);
    /// 3. unsanitized intra-function taint paths (precomputed if available,
    ///    otherwise computed here), skipping any `(file, line)` already
    ///    reported by the first two sources.
    ///
    /// At most `self.max_findings` findings are returned; the cap is checked
    /// before every push. Files for which `is_test_path` is true are skipped.
    fn detect(
        &self,
        ctx: &crate::detectors::analysis_context::AnalysisContext,
    ) -> Result<Vec<Finding>> {
        let graph = ctx.graph;
        let files = &ctx.as_file_provider();
        let mut findings = vec![];

        // Phase 2e dual-branch gate. When `true`, Python `.py` files
        // go through the AST-driven predictor path
        // (`scan_python_file_dual_branch`) and skip the legacy line
        // scanner. Other languages and the flag-off path are
        // unchanged. Symmetric with command-injection's `flag_on`
        // (commit 6c47b271).
        let flag_on = ctx.dual_branch.is_enabled_for("xxe");

        for path in
            files.files_with_extensions(&["py", "js", "ts", "java", "php", "cs", "rb", "go"])
        {
            if findings.len() >= self.max_findings {
                break;
            }

            let path_str = path.to_string_lossy().to_string();

            // Skip test files
            if crate::detectors::base::is_test_path(&path_str) {
                continue;
            }

            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");

            // Phase 2e: AST-driven predictor path for Python when the
            // dual-branch flag is on. Replaces the legacy regex pass
            // for `.py` files; other languages and the flag-off path
            // fall through to the regex scanner below.
            if flag_on && ext == "py" {
                if let Some(content) = files.content(path) {
                    // Take only as many findings as the cap still allows.
                    let remaining = self.max_findings.saturating_sub(findings.len());
                    findings.extend(
                        self.scan_python_file_dual_branch(path, &content)
                            .into_iter()
                            .take(remaining),
                    );
                }
                continue;
            }

            if let Some(content) = files.content(path) {
                // Don't skip entire file — check protection near each parse call (#16)
                let file_has_any_protection = Self::has_protection(&content, ext);

                let lines: Vec<&str> = content.lines().collect();

                for (i, line) in lines.iter().enumerate() {
                    // Enforce the cap per line, not only per file: one file
                    // with many parse calls must not overshoot it.
                    if findings.len() >= self.max_findings {
                        break;
                    }

                    let prev_line = if i > 0 { Some(lines[i - 1]) } else { None };
                    if crate::detectors::is_line_suppressed(line, prev_line) {
                        continue;
                    }

                    if !XXE_PATTERN.is_match(line) {
                        continue;
                    }

                    // Skip import-only lines — importing a module is not a vulnerability
                    let trimmed = line.trim();
                    if trimmed.starts_with("from ") || trimmed.starts_with("import ") {
                        continue;
                    }

                    // Skip lines that reference XML modules without actual parse calls
                    let has_parse_call = line.contains(".parse(")
                        || line.contains(".parseString(")
                        || line.contains("XMLParser(")
                        || line.contains("DocumentBuilder")
                        || line.contains("SAXParser(")
                        || line.contains("XMLReader(");
                    if !has_parse_call {
                        continue;
                    }

                    // Skip JS static data (globals lists, config objects)
                    if ext == "js" {
                        let trimmed_line = line.trim();
                        if trimmed_line.ends_with("false,")
                            || trimmed_line.ends_with("true,")
                            || trimmed_line.ends_with("false")
                            || trimmed_line.ends_with("true")
                        {
                            continue;
                        }
                    }

                    // Check for protection near this parse call, not just file-wide (#16)
                    // Look 15 lines before and after for protection patterns
                    if file_has_any_protection {
                        let local_start = i.saturating_sub(15);
                        // The slice end is exclusive, so `i + 16` is needed
                        // to cover the 15 lines that follow line `i`.
                        let local_end = (i + 16).min(lines.len());
                        let local_context = lines[local_start..local_end].join("\n");
                        if Self::has_protection(&local_context, ext) {
                            continue; // This specific parser is protected
                        }
                    }

                    // Check for user input flow
                    let has_user_input = Self::has_user_input_flow(&lines, i);

                    // Get function context
                    let func_context = graph.find_function_at(&path_str, (i + 1) as u32).map(|f| {
                        let callers =
                            graph.get_callers(f.qn(crate::graph::interner::global_interner()));
                        let has_external_callers = callers.iter().any(|c| {
                            let name = c
                                .node_name(crate::graph::interner::global_interner())
                                .to_lowercase();
                            name.contains("route")
                                || name.contains("handler")
                                || name.contains("api")
                                || name.contains("upload")
                                || name.contains("import")
                                || name.contains("parse")
                        });
                        (
                            f.node_name(crate::graph::interner::global_interner())
                                .to_string(),
                            has_external_callers,
                        )
                    });

                    // Calculate severity
                    let severity = if has_user_input {
                        Severity::Critical // User input directly to XML parser
                    } else {
                        Severity::High // XXE is always serious
                    };

                    // Build notes
                    let mut notes = Vec::new();
                    if has_user_input {
                        notes.push("⚠️ User input flows to XML parser".to_string());
                    }
                    if let Some((func_name, external)) = &func_context {
                        notes.push(format!("📦 In function: `{}`", func_name));
                        if *external {
                            notes.push("🌐 Called from route handlers".to_string());
                        }
                    }
                    notes.push(format!("❌ No XXE protection detected for {}", ext));

                    let context_notes = format!("\n\n**Analysis:**\n{}", notes.join("\n"));

                    findings.push(Finding {
                        id: String::new(),
                        detector: "XxeDetector".to_string(),
                        severity,
                        title: "XML External Entity (XXE) vulnerability".to_string(),
                        description: format!(
                            "XML parser processes external entities without proper restrictions.{}",
                            context_notes
                        ),
                        affected_files: vec![path.to_path_buf()],
                        line_start: Some((i + 1) as u32),
                        line_end: Some((i + 1) as u32),
                        suggested_fix: Some(get_fix_example(ext).to_string()),
                        estimated_effort: Some("20 minutes".to_string()),
                        category: Some("security".to_string()),
                        cwe_id: Some("CWE-611".to_string()),
                        why_it_matters: Some(
                            "XXE vulnerabilities allow attackers to:\n\
                             • Read arbitrary files from the server (file:///etc/passwd)\n\
                             • Perform SSRF attacks (http://internal-server/)\n\
                             • Denial of service (billion laughs attack)\n\
                             • Port scanning of internal networks"
                                .to_string(),
                        ),
                        ..Default::default()
                    });
                }
            }
        }

        // Supplement with intra-function taint analysis (precomputed or fallback)
        let intra_paths = if let Some(intra) = self.precomputed_intra.get() {
            intra.clone()
        } else {
            let taint_analyzer = crate::detectors::taint::TaintAnalyzer::new();
            crate::detectors::taint::run_intra_function_taint(
                // XXE uses its own taint category with XML-specific sinks
                // (per docs/superpowers/specs/2026-05-09-detector-precision-batch1-design.md §6).
                &taint_analyzer,
                graph,
                crate::detectors::taint::TaintCategory::Xxe,
                &self.repository_path,
            )
        };
        // Locations already reported above; taint paths ending on the same
        // (file, line) are dropped as duplicates.
        let mut seen: std::collections::HashSet<(String, u32)> = findings
            .iter()
            .filter_map(|f| {
                f.affected_files
                    .first()
                    .map(|p| (p.to_string_lossy().to_string(), f.line_start.unwrap_or(0)))
            })
            .collect();
        for path in intra_paths.iter().filter(|p| !p.is_sanitized) {
            // Check the cap BEFORE pushing so a list that is already full
            // does not grow by one more taint finding.
            if findings.len() >= self.max_findings {
                break;
            }
            let loc = (path.sink_file.clone(), path.sink_line);
            if !seen.insert(loc) {
                continue;
            }
            findings.push(crate::detectors::taint::taint_path_to_finding(
                path,
                "XxeDetector",
                "XML External Entity Injection",
            ));
        }

        info!(
            "XxeDetector found {} findings (graph-aware + taint)",
            findings.len()
        );
        Ok(findings)
    }
}

impl crate::detectors::RegisteredDetector for XxeDetector {
    /// Builds an `XxeDetector` rooted at `init.repo_path` and returns it as a
    /// shared `Detector` trait object.
    fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
        let detector = Self::new(init.repo_path);
        std::sync::Arc::new(detector)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::graph::builder::GraphBuilder;

    /// An unhardened `etree.parse` call must produce an `XxeDetector` finding.
    #[test]
    fn test_detects_xxe_without_protection() {
        let source = "\nfrom lxml import etree\ntree = etree.parse(xml_file)\n";
        let graph = GraphBuilder::new().freeze();
        let detector = XxeDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("parser.py", source)],
        );

        let findings = detector.detect(&ctx).expect("detection should succeed");

        assert!(
            !findings.is_empty(),
            "Should detect XML parsing without XXE protection"
        );
        let from_xxe = findings
            .iter()
            .filter(|f| f.detector == "XxeDetector")
            .count();
        assert!(from_xxe > 0);
    }

    /// Parsing through `defusedxml` counts as protected and must not be flagged.
    #[test]
    fn test_no_finding_with_defusedxml() {
        let source = "\nimport defusedxml.ElementTree as ET\ntree = ET.parse(xml_file)\n";
        let graph = GraphBuilder::new().freeze();
        let detector = XxeDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("safe_parser.py", source)],
        );

        let findings = detector.detect(&ctx).expect("detection should succeed");
        let titles: Vec<_> = findings.iter().map(|f| &f.title).collect();

        assert!(
            findings.is_empty(),
            "Should not flag XML parsing with defusedxml protection, but got: {:?}",
            titles
        );
    }

    /// A file that only imports XML modules must not be flagged.
    #[test]
    fn test_no_finding_for_import_only() {
        let source = "from xml.dom import minidom, pulldom\nimport xml.etree.ElementTree as ET\n";
        let graph = GraphBuilder::new().freeze();
        let detector = XxeDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("parser.py", source)],
        );

        let findings = detector.detect(&ctx).expect("detection should succeed");
        let titles: Vec<_> = findings.iter().map(|f| &f.title).collect();

        assert!(
            findings.is_empty(),
            "Should not flag import-only lines. Found: {:?}",
            titles
        );
    }

    /// A hand-rolled defused parser defined near the parse call counts as
    /// protection, so the `pulldom.parse` call must not be flagged.
    #[test]
    fn test_no_finding_with_custom_defused_parser() {
        let source = "class DefusedExpatParser:\n    feature_external_ges = False\n    feature_external_pes = False\n    def reset(self):\n        raise DTDForbidden()\n\ndef deserialize(stream):\n    event_stream = pulldom.parse(stream, parser)\n";
        let graph = GraphBuilder::new().freeze();
        let detector = XxeDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("serializer.py", source)],
        );

        let findings = detector.detect(&ctx).expect("detection should succeed");
        let titles: Vec<_> = findings.iter().map(|f| &f.title).collect();

        assert!(
            findings.is_empty(),
            "Should not flag XML parsing when custom defused parser exists. Found: {:?}",
            titles
        );
    }

    /// A JS globals table that merely names `DOMParser` must not be flagged.
    #[test]
    fn test_no_finding_for_js_static_data() {
        let source =
            "var globals = {\n    \"DOMParser\": false,\n    \"XMLHttpRequest\": false,\n};\n";
        let graph = GraphBuilder::new().freeze();
        let detector = XxeDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("globals.js", source)],
        );

        let findings = detector.detect(&ctx).expect("detection should succeed");
        let titles: Vec<_> = findings.iter().map(|f| &f.title).collect();

        assert!(
            findings.is_empty(),
            "Should not flag JS static data. Found: {:?}",
            titles
        );
    }

    // ---- §6 architectural-fix regression tests ----------------------------
    //
    // These tests pin the invariant that XxeDetector emits findings ONLY for
    // real XML sinks. Before the fix, XxeDetector borrowed PathTraversal taint
    // paths and relabeled them XXE — producing false positives like Click
    // utils.py:490 (os.path.join flagged as XXE despite zero XML imports).

    /// Regression for the Click utils.py:490 false positive: a function that
    /// only joins paths via `os.path.join` and uses no XML API must yield no
    /// taint path under the XXE category.
    #[test]
    fn test_no_xxe_finding_for_path_traversal_sink_without_xml_imports() {
        use crate::detectors::taint::{TaintAnalyzer, TaintCategory};
        // Same shape as Click's utils.py:490: argv-form path join fed by
        // user-influenced data, with no XML API anywhere.
        let func_source = concat!(
            "def get_app_dir(app_name, roaming=True, force_posix=False):\n",
            "    folder = os.environ.get('XDG_CONFIG_HOME', '~/.config')\n",
            "    return os.path.join(folder, app_name)\n",
        );
        let analyzer = TaintAnalyzer::new();
        let paths = analyzer.analyze_intra_function(
            func_source,
            "get_app_dir",
            "click/utils.py",
            488,
            crate::parsers::lightweight::Language::Python,
            TaintCategory::Xxe,
        );
        let sinks: Vec<_> = paths.iter().map(|p| &p.sink_function).collect();
        assert!(
            paths.is_empty(),
            "XXE category must NOT trace taint paths to os.path.join (it's a path-traversal sink, not XML). \
             Got paths: {:?}",
            sinks
        );
    }

    /// Recall guard: request data flowing into `lxml.etree.parse` must still
    /// yield a taint path under the XXE category, and its sink must name an
    /// XML API.
    #[test]
    fn test_xxe_finding_emitted_for_real_xml_taint_path() {
        use crate::detectors::taint::{TaintAnalyzer, TaintCategory};
        let func_source = concat!(
            "def parse_user_xml(request):\n",
            "    user_data = request.body\n",
            "    return lxml.etree.parse(user_data)\n",
        );
        let analyzer = TaintAnalyzer::new();
        let paths = analyzer.analyze_intra_function(
            func_source,
            "parse_user_xml",
            "app/parser.py",
            1,
            crate::parsers::lightweight::Language::Python,
            TaintCategory::Xxe,
        );
        assert!(
            !paths.is_empty(),
            "XXE category must still trace taint paths to real XML sinks (lxml.etree.parse). \
             The §6 fix must not destroy recall on real XXE patterns."
        );

        let xml_sink_markers = ["etree", "lxml", "XMLParser"];
        let names_xml_api = paths.iter().any(|p| {
            xml_sink_markers
                .iter()
                .any(|marker| p.sink_function.contains(marker))
        });
        let sinks: Vec<_> = paths.iter().map(|p| &p.sink_function).collect();
        assert!(
            names_xml_api,
            "Sink function should reference an XML API, got: {:?}",
            sinks
        );
    }

    /// Stronger property: every finding built from an XXE taint path must
    /// mention an XML API in its description, so wrong-category sinks cannot
    /// leak back into the XXE detector unnoticed.
    #[test]
    fn test_xxe_taint_finding_cites_xml_api_in_description() {
        use crate::detectors::taint::{taint_path_to_finding, TaintAnalyzer, TaintCategory};
        let func_source = concat!(
            "def parse_user_xml(request):\n",
            "    user_data = request.body\n",
            "    return lxml.etree.parse(user_data)\n",
        );
        let analyzer = TaintAnalyzer::new();
        let paths = analyzer.analyze_intra_function(
            func_source,
            "parse_user_xml",
            "app/parser.py",
            1,
            crate::parsers::lightweight::Language::Python,
            TaintCategory::Xxe,
        );
        assert!(!paths.is_empty(), "must produce at least one XXE path");

        let xml_keywords = [
            "etree",
            "lxml",
            "xml",
            "XMLParser",
            "DOMParser",
            "DocumentBuilder",
            "SAXParser",
            "XMLReader",
            "minidom",
            "pulldom",
            "Nokogiri",
            "DOMDocument",
            "simplexml",
        ];
        let built = paths
            .iter()
            .map(|p| taint_path_to_finding(p, "XxeDetector", "XML External Entity Injection"));
        for finding in built {
            let mut cites_xml = false;
            for kw in xml_keywords.iter() {
                if finding.description.contains(kw) {
                    cites_xml = true;
                    break;
                }
            }
            assert!(
                cites_xml,
                "XXE finding description must cite an XML API name; got: {}",
                finding.description
            );
        }
    }

    /// Companion check for the §6 decoupling: analysing
    /// `os.path.join(folder, app_name)` under the `PathTraversal` category
    /// must still yield at least one taint path. The fix separates XXE from
    /// PathTraversal; it must not silence PathTraversal itself.
    #[test]
    fn test_path_traversal_detector_still_fires_on_path_join() {
        use crate::detectors::taint::{TaintAnalyzer, TaintCategory};

        // Fixture bytes are kept at column 0 so the Python source is verbatim.
        let python_fixture = "\
def get_app_dir(app_name, roaming=True):
    folder = os.environ.get('XDG_CONFIG_HOME', '~/.config')
    return os.path.join(folder, app_name)
";
        let taint = TaintAnalyzer::new();
        let traversal_paths = taint.analyze_intra_function(
            python_fixture,
            "get_app_dir",
            "click/utils.py",
            488,
            crate::parsers::lightweight::Language::Python,
            TaintCategory::PathTraversal,
        );

        let found_any = !traversal_paths.is_empty();
        assert!(
            found_any,
            "PathTraversal category MUST still trace taint to os.path.join. \
             The §6 fix decouples XXE from PathTraversal, but PathTraversal itself \
             must continue firing."
        );
    }

    // ─────────────────────────────────────────────────────────────────
    // Phase 2e dual-branch integration tests.
    //
    // Mirror the structure in command_injection::tests (commit
    // `6c47b271`): a `run_dual_branch` helper that flips the
    // per-detector flag on, then a small set of pinned cases:
    //
    //   1. `flag_off_xxe_emits_single_branch_unchanged` — pins the
    //      opt-in promise: legacy users see no behavioral change.
    //   2. `flag_on_python_xxe_emits_dual_branch` — smoke for the
    //      wire-up: a Python XXE site grows an `alternative_branch`.
    //   3. `flag_on_defusedxml_classifies_benign` — the canonical
    //      safe-by-construction path flips Benign with Info severity.
    //   4. `flag_on_lxml_with_protection_classifies_benign` — the
    //      explicit-protection path (decisions D1 worked example #1).
    //      Smoke-only: runs the two-statement and inline forms but
    //      asserts no label.
    //   5. `flag_on_stdlib_etree_in_handler_classifies_realbug` —
    //      the canonical RealBug path (decisions D1 worked example #2).
    //   6. `flag_on_non_python_unchanged` — JS XXE still uses the
    //      legacy regex scanner (D4: per-language scope).
    // ─────────────────────────────────────────────────────────────────

    /// Runs `XxeDetector` over a single mock file with dual-branch mode
    /// enabled, both globally and for the `"xxe"` detector key, and returns
    /// the findings.
    ///
    /// Panics if `detect` returns an error, so every caller implicitly
    /// asserts that detection succeeds on its fixture.
    fn run_dual_branch(file: &str, content: &str) -> Vec<Finding> {
        use crate::config::DualBranchConfig;
        use crate::detectors::analysis_context::AnalysisContext;
        use std::collections::HashMap;

        let graph = GraphBuilder::new().freeze();
        let xxe = XxeDetector::new("/mock/repo");

        // Per-detector opt-in map: only the "xxe" key is switched on.
        let dual_branch_on = DualBranchConfig {
            enabled: true,
            detectors: HashMap::from([("xxe".to_string(), true)]),
        };
        let ctx = AnalysisContext::test_with_mock_files(&graph, vec![(file, content)])
            .with_dual_branch(dual_branch_on);

        xxe.detect(&ctx).expect("detection should succeed")
    }

    /// Opt-in promise: when the context is built without `with_dual_branch`,
    /// a Python XXE site still produces findings, but none carries an
    /// `alternative_branch` and every prediction reason has weight exactly
    /// zero (weight-0 graph-enrichment reasons are allowed).
    #[test]
    fn flag_off_xxe_emits_single_branch_unchanged() {
        let graph = GraphBuilder::new().freeze();
        let xxe = XxeDetector::new("/mock/repo");
        // `\x20` keeps the first indent space: the `\`-newline continuation
        // would otherwise strip all leading whitespace on the next line.
        let mock_files = vec![(
            "vuln.py",
            "from lxml import etree\n\
             def handler(req):\n\
             \x20   return etree.parse(req.data)\n",
        )];
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph, mock_files,
        );

        let findings = xxe.detect(&ctx).expect("detection should succeed");
        assert!(!findings.is_empty(), "must still fire single-branch");

        for finding in &findings {
            let single_branch = finding.alternative_branch.is_none();
            assert!(
                single_branch,
                "no alternative_branch when flag off: {:?}",
                finding.title
            );

            let has_weighted_reason = finding
                .prediction_reasons
                .iter()
                .any(|reason| reason.weight != 0.0);
            assert!(
                !has_weighted_reason,
                "no weight-bearing predictor reasons when flag off; \
                 graph-enrichment weight-0 reasons are allowed. reasons: {:?}",
                finding
                    .prediction_reasons
                    .iter()
                    .map(|r| (&r.kind, r.weight))
                    .collect::<Vec<_>>()
            );
        }
    }

    /// Smoke test for the dual-branch wire-up: with the flag on, a stdlib
    /// `ET.parse(req.data)` call inside a handler must yield a finding that
    /// carries an `alternative_branch` and at least one prediction reason.
    /// The alternative is asserted to be Benign with Info severity.
    #[test]
    fn flag_on_python_xxe_emits_dual_branch() {
        let findings = run_dual_branch(
            "vuln.py",
            "from xml.etree import ElementTree as ET\n\
             def handler(req):\n\
             \x20   return ET.parse(req.data)\n",
        );
        assert!(!findings.is_empty(), "must fire dual-branch");

        // Select the finding by shape rather than by position so the test
        // does not depend on the order in which the detector emits findings
        // (matches how the sibling tests pick their finding). On failure,
        // list every title to make the miss diagnosable.
        let f = findings
            .iter()
            .find(|f| f.alternative_branch.is_some())
            .unwrap_or_else(|| {
                panic!(
                    "alternative_branch must be populated when flag on. titles={:?}",
                    findings.iter().map(|f| &f.title).collect::<Vec<_>>()
                )
            });
        assert!(
            !f.prediction_reasons.is_empty(),
            "at least one prediction reason"
        );
        // The asserted alternative is Benign/Info, i.e. the predicted branch
        // is the other one (RealBug).
        let alt = f
            .alternative_branch
            .as_ref()
            .expect("finding was selected for having an alternative_branch");
        assert_eq!(alt.label, crate::dual_branch::BranchLabel::Benign);
        assert_eq!(alt.severity, Severity::Info);
    }

    /// Safe-by-construction path: parsing through an alias of
    /// `defusedxml.ElementTree` must still surface a dual-branch finding,
    /// but at `Severity::Info`, with RealBug recorded as the alternative.
    #[test]
    fn flag_on_defusedxml_classifies_benign() {
        use crate::dual_branch::BranchLabel;

        let findings = run_dual_branch(
            "safe.py",
            "import defusedxml.ElementTree as ET\n\
             def handler(req):\n\
             \x20   return ET.parse(req.data)\n",
        );
        // Benign verdicts are reported, not dropped (drop-Low is out of
        // scope for v0), so the user can see the predictor's reasoning.
        assert!(!findings.is_empty(), "must surface even when Benign");

        let Some(benign) = findings.iter().find(|f| f.is_dual_branch()) else {
            panic!("must have a dual-branch finding");
        };
        assert_eq!(
            benign.severity,
            Severity::Info,
            "predicted Benign → Info, got {:?}",
            benign.severity
        );

        let other_branch = benign.alternative_branch.as_ref().unwrap();
        assert_eq!(other_branch.label, BranchLabel::RealBug);
    }

    /// Smoke-only documentation test for lxml with explicit protection
    /// kwargs (decisions D1 worked example #1).
    ///
    /// Despite the name, no label or severity is asserted. The only check is
    /// the one inside `run_dual_branch`: detection must succeed on both
    /// fixtures. The strong invariant lives in
    /// `flag_on_stdlib_etree_in_handler_classifies_realbug`.
    #[test]
    fn flag_on_lxml_with_protection_classifies_benign() {
        // Two-statement form (`parser = ...; parse(b, parser)`): a documented
        // v0 limitation (D5 #2) — the kwargs do NOT flow to the parse call.
        // Exercised for documentation only; the findings are discarded.
        let two_statement_form = "from lxml import etree\n\
             def handler(req):\n\
             \x20   parser = etree.XMLParser(resolve_entities=False, no_network=True)\n\
             \x20   return etree.parse(req.data, parser)\n";
        drop(run_dual_branch("hardened.py", two_statement_form));

        // Inline form: the XMLParser constructor sits inside the parse call.
        // Per the original notes, `scan_inline_xmlparser_kwargs` is expected
        // to fire both protection-kwarg signals here
        // (+0.10 + 0.40 + 0.30 = +0.80 → Benign). That outcome is not
        // asserted because it depends on tree-sitter's arg-list shape.
        let inline_form = "from lxml import etree\n\
             def handler(req):\n\
             \x20   return etree.parse(\n\
             \x20       req.data,\n\
             \x20       etree.XMLParser(resolve_entities=False, no_network=True),\n\
             \x20   )\n";
        // Deliberately unused: both calls exist so a future contributor
        // reading the tests sees what is known-fragile.
        let _inline_findings = run_dual_branch("hardened.py", inline_form);
    }

    /// Canonical RealBug path (decisions D1 worked example #2): stdlib
    /// `ET.fromstring(req.data)` inside `handle_upload` must produce a
    /// dual-branch finding at High or Critical severity whose title mentions
    /// "xxe" (case-insensitive), with a Benign/Info alternative.
    #[test]
    fn flag_on_stdlib_etree_in_handler_classifies_realbug() {
        use crate::dual_branch::BranchLabel;

        // Signals expected per the original notes: stdlib-unsafe (-0.20)
        // plus user-input (-0.50). Only the resulting verdict is asserted.
        let findings = run_dual_branch(
            "vuln.py",
            "import xml.etree.ElementTree as ET\n\
             def handle_upload(req):\n\
             \x20   return ET.fromstring(req.data)\n",
        );
        assert!(!findings.is_empty());

        let Some(real_bug) = findings.iter().find(|f| f.is_dual_branch()) else {
            panic!("must have a dual-branch finding");
        };

        let is_severe =
            real_bug.severity == Severity::High || real_bug.severity == Severity::Critical;
        assert!(
            is_severe,
            "predicted RealBug uses 2D severity table — stdlib parse \
             on user input is High/Critical, got {:?}",
            real_bug.severity
        );

        let lowered_title = real_bug.title.to_lowercase();
        assert!(
            lowered_title.contains("xxe"),
            "RealBug title; got {:?}",
            real_bug.title
        );

        let other_branch = real_bug.alternative_branch.as_ref().unwrap();
        assert_eq!(other_branch.label, BranchLabel::Benign);
        assert_eq!(other_branch.severity, Severity::Info);
    }

    /// Per-language scope (D4): dual-branch is Python-only in v0, so a
    /// JavaScript `xml2js.parseString(req.body, ...)` site must never yield
    /// a finding with an `alternative_branch`, even with the flag on.
    #[test]
    fn flag_on_non_python_unchanged() {
        let findings = run_dual_branch(
            "vuln.js",
            "const xml2js = require('xml2js');\n\
             function handle(req) {\n\
             \x20   xml2js.parseString(req.body, (err, result) => {});\n\
             }\n",
        );

        // The legacy regex scanner may or may not fire on this exact input;
        // an empty result is acceptable. The only requirement is that no
        // finding has the dual-branch shape, so look for the first offender.
        let first_offender = findings
            .iter()
            .find(|finding| finding.alternative_branch.is_some());
        if let Some(offender) = first_offender {
            panic!(
                "JS XXE must not get dual-branch shape (D4). title={:?}",
                offender.title
            );
        }
    }

    // ─────────────────────────────────────────────────────────────────
    // Phase 2e real-world signature tests.
    //
    // These pin the predictor's behavior on minimized but recognizable
    // shapes from real Python codebases. The goal is the same as
    // Phase 2d's real-world pins (commit `f8ffd237`): catch the day
    // when an evidence-extractor refactor accidentally breaks a
    // known-correct verdict on a real-world idiom.
    //
    // Three signatures pinned:
    //
    //   1. `real_defusedxml_canonical_usage` — the recommended-by-
    //      OWASP shape. Must classify Benign with Info via the Step 1.5
    //      collapse (D1 amendment).
    //   2. `real_lxml_with_explicit_protection` — explicit
    //      `XMLParser(resolve_entities=False, no_network=True)` inline
    //      with the parse call. Documents the v0 limitation that
    //      two-statement variables (`parser = ...; parse(b, parser)`)
    //      are NOT seen, but the inline form should fire.
    //   3. `real_stdlib_etree_in_flask_handler` — the canonical CVE
    //      pattern: stdlib `xml.etree` reading `request.data` inside
    //      a Flask route. Must classify RealBug.
    //
    // The shapes are simplified to fit a single-file mock context.
    // The minimization is documented inline so a future contributor
    // can re-validate against upstream when the API drifts.
    // ─────────────────────────────────────────────────────────────────

    /// Real-world pin: the `defusedxml` drop-in pattern.
    ///
    /// The fixture uses the alias-import shape common when migrating from
    /// the stdlib:
    ///
    /// ```text
    /// import defusedxml.ElementTree as ET
    /// tree = ET.parse(xml_blob)
    /// ```
    ///
    /// The direct form (`from defusedxml.ElementTree import parse`) is
    /// described in the original notes as resolving to the same
    /// `XmlApi::Defusedxml` classification via the alias map; only the alias
    /// shape is exercised here. The finding must collapse to Info (D1
    /// amendment), keep RealBug as the alternative, and cite a
    /// `StructuralPattern` reason mentioning "defusedxml".
    #[test]
    fn real_defusedxml_canonical_usage() {
        use crate::dual_branch::{BranchLabel, PredictionReasonKind};

        let findings = run_dual_branch(
            "real_defusedxml.py",
            "import defusedxml.ElementTree as ET\n\
             def process_upload(req):\n\
             \x20   tree = ET.parse(req.files['xml'])\n\
             \x20   return tree.getroot()\n",
        );
        let Some(benign) = findings.iter().find(|f| f.is_dual_branch()) else {
            panic!("dual-branch finding expected");
        };

        assert_eq!(
            benign.severity,
            Severity::Info,
            "defusedxml call must collapse to Info per D1 amendment; \
             got {:?}, reasons={:?}",
            benign.severity,
            benign
                .prediction_reasons
                .iter()
                .map(|r| (&r.kind, r.weight))
                .collect::<Vec<_>>()
        );

        let other_branch = benign
            .alternative_branch
            .as_ref()
            .expect("Benign prediction → RealBug alternative");
        assert_eq!(other_branch.label, BranchLabel::RealBug);

        // The safe-by-construction reason must be among the cited reasons.
        let cites_defusedxml = benign.prediction_reasons.iter().any(|r| match &r.kind {
            PredictionReasonKind::StructuralPattern { description } => {
                description.contains("defusedxml")
            }
            _ => false,
        });
        assert!(
            cites_defusedxml,
            "must surface the defusedxml safe-by-construction reason; got {:?}",
            benign
                .prediction_reasons
                .iter()
                .map(|r| &r.kind)
                .collect::<Vec<_>>()
        );
    }

    /// Real-world pin: lxml with explicit XXE protection passed inline.
    ///
    /// Minimized shape under test:
    ///
    /// ```text
    /// from lxml import etree
    /// def parse_signed_xml(xml_bytes):
    ///     return etree.fromstring(
    ///         xml_bytes,
    ///         parser=etree.XMLParser(
    ///             resolve_entities=False,
    ///             no_network=True,
    ///         ),
    ///     )
    /// ```
    ///
    /// NOTE(review): the original comment attributed this shape to
    /// feed-processing and SAML/SOAP code in specific upstream projects;
    /// those references were not re-verified here.
    ///
    /// Only structural invariants are asserted: a dual-branch finding exists
    /// and it cites an `ImportPresence` reason for `lxml.etree`. The exact
    /// verdict is left unpinned because it depends on whether the
    /// inline-kwarg scanner sees the kwargs through tree-sitter's argument
    /// shape (decisions D1 worked example #1).
    #[test]
    fn real_lxml_with_explicit_protection() {
        use crate::dual_branch::PredictionReasonKind;

        let findings = run_dual_branch(
            "real_lxml_hardened.py",
            "from lxml import etree\n\
             def parse_signed_xml(xml_bytes):\n\
             \x20   return etree.fromstring(\n\
             \x20       xml_bytes,\n\
             \x20       parser=etree.XMLParser(\n\
             \x20           resolve_entities=False,\n\
             \x20           no_network=True,\n\
             \x20       ),\n\
             \x20   )\n",
        );
        let Some(hardened) = findings.iter().find(|f| f.is_dual_branch()) else {
            panic!("dual-branch finding expected");
        };
        assert!(hardened.alternative_branch.is_some());

        // At minimum, the lxml-import reason must fire.
        let cites_lxml_import = hardened.prediction_reasons.iter().any(|r| match &r.kind {
            PredictionReasonKind::ImportPresence { module } => module == "lxml.etree",
            _ => false,
        });
        assert!(
            cites_lxml_import,
            "must surface the lxml.etree import reason; got {:?}",
            hardened
                .prediction_reasons
                .iter()
                .map(|r| &r.kind)
                .collect::<Vec<_>>()
        );
        // Documented v0 limitation (decisions D5 #2): the two-statement form
        // (`parser = make_safe_parser(); etree.parse(b, parser)`) does not
        // fire the kwarg signals, so only the inline form is tested here.
    }

    /// Real-world pin: stdlib `xml.etree` parsing `request.data` in a
    /// Flask-style handler, the shape internal tools fall into when they
    /// reach for the stdlib without realizing it is not safe-by-default.
    ///
    /// Upstream idiom being minimized:
    ///
    /// ```text
    /// @app.route("/import", methods=["POST"])
    /// def import_xml():
    ///     data = request.data
    ///     tree = ET.fromstring(data)
    ///     return process(tree)
    /// ```
    ///
    /// Predictor signals expected per the original notes (the weights are
    /// not asserted here):
    ///   * `W_STDLIB_UNSAFE_PARSER` (-0.20) — stdlib xml.etree.
    ///   * `W_USER_INPUT_FLOW` (-0.50) — `request.data` within 10 lines.
    ///   * `W_ENCLOSING_HANDLER` (-0.30) — "import_xml" matches the handler
    ///     lexicon (see the decisions doc note on the "import" keyword).
    ///
    /// Total -1.00 → strongly RealBug. The notes expect Critical from the 2D
    /// severity table; the assertion accepts High or Critical.
    ///
    /// NOTE(review): the original comment cited CVE-2017-9233 (libexpat) and
    /// CVE-2018-14041 as the CVE family. CVE-2018-14041 appears to be a
    /// Bootstrap XSS advisory rather than an XML issue — confirm before
    /// citing it again.
    #[test]
    fn real_stdlib_etree_in_flask_handler() {
        use crate::dual_branch::{BranchLabel, PredictionReasonKind};

        let findings = run_dual_branch(
            "real_flask_handler.py",
            "import xml.etree.ElementTree as ET\n\
             from flask import request\n\
             def import_xml():\n\
             \x20   data = request.data\n\
             \x20   return ET.fromstring(data)\n",
        );
        let Some(real_bug) = findings.iter().find(|f| f.is_dual_branch()) else {
            panic!("dual-branch finding expected");
        };

        let is_severe =
            real_bug.severity == Severity::High || real_bug.severity == Severity::Critical;
        assert!(
            is_severe,
            "canonical CVE shape must be High/Critical; got {:?}",
            real_bug.severity
        );

        // The stdlib-unsafe reason must be among the cited reasons.
        let cites_stdlib = real_bug.prediction_reasons.iter().any(|r| match &r.kind {
            PredictionReasonKind::StructuralPattern { description } => {
                description.contains("stdlib")
            }
            _ => false,
        });
        assert!(
            cites_stdlib,
            "must surface stdlib-unsafe reason; got {:?}",
            real_bug
                .prediction_reasons
                .iter()
                .map(|r| &r.kind)
                .collect::<Vec<_>>()
        );

        let other_branch = real_bug
            .alternative_branch
            .as_ref()
            .expect("RealBug prediction → Benign alternative");
        assert_eq!(other_branch.label, BranchLabel::Benign);
        assert_eq!(other_branch.severity, Severity::Info);
    }
}