repotoire 0.8.2

//! SSRF Detector

// Phase 2f dual-branch submodules.
mod annotation;
mod evidence;
mod predict;

use crate::detectors::base::{Detector, DetectorConfig};
use crate::detectors::fast_search::{find_in, *};
use crate::detectors::taint::{TaintAnalysisResult, TaintAnalyzer, TaintCategory};
use crate::models::{deterministic_finding_id, Finding, Severity};
use anyhow::Result;
use regex::Regex;
use std::path::{Path, PathBuf};
use std::sync::LazyLock;

/// Case-insensitive pattern used by the line scanner to spot lines that
/// mention an HTTP client API: `requests.{get,post,put,delete}`, `fetch(`,
/// `axios.`, `http.get`, `urllib`, `urlopen`, `HttpClient`, or `curl`.
/// Compiled lazily on first use.
static HTTP_CLIENT: LazyLock<Regex> = LazyLock::new(|| {
    let pattern =
        r"(?i)(requests\.(get|post|put|delete)|fetch\(|axios\.|http\.get|urllib|urlopen|HttpClient|curl)";
    Regex::new(pattern).expect("valid regex")
});

/// Detector for server-side request forgery (CWE-918) call sites.
pub struct SsrfDetector {
    /// Repository root; handed to the intra-function taint pass in `detect`
    /// when no precomputed intra-function paths are available.
    repository_path: PathBuf,
    /// Finding cap; `detect` stops visiting further files once it is reached.
    max_findings: usize,
    /// Analyzer used by `detect` to trace taint when nothing was precomputed.
    taint_analyzer: TaintAnalyzer,
    // NOTE(review): both slots are presumably filled by the
    // `impl_taint_precompute!` expansion — confirm against the macro.
    /// Precomputed graph-level taint paths; when unset, `detect` falls back
    /// to `TaintAnalyzer::trace_taint`.
    precomputed_cross: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
    /// Precomputed intra-function taint paths; when unset, `detect` falls
    /// back to `run_intra_function_taint`.
    precomputed_intra: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
}

impl SsrfDetector {
    /// Creates a detector rooted at `repository_path` with a cap of 50
    /// findings, a fresh [`TaintAnalyzer`], and empty precomputed-taint slots.
    pub fn new(repository_path: impl Into<PathBuf>) -> Self {
        let repository_path = repository_path.into();
        Self {
            repository_path,
            max_findings: 50,
            taint_analyzer: TaintAnalyzer::new(),
            precomputed_cross: std::sync::OnceLock::new(),
            precomputed_intra: std::sync::OnceLock::new(),
        }
    }

    /// Dual-branch scan of a single Python file.
    ///
    /// The file is parsed once; every call site reported by
    /// [`evidence::collect_python_http_sites`] becomes one dual-branch
    /// finding built by `build_dual_branch_ssrf_finding`, unless the call
    /// line (or the line above it) carries a suppression recognised by
    /// `is_line_suppressed`.
    ///
    /// Yields an empty vec when the content contains a NUL byte, when the
    /// parser returns no tree, or when the collector reports no sites.
    fn scan_python_file_dual_branch(&self, path: &Path, content: &str) -> Vec<Finding> {
        // Content with NUL bytes is not handed to the parser.
        if content.contains('\0') {
            return Vec::new();
        }
        let parsed = crate::detectors::ast_fingerprint::parse_root_ext(
            content,
            crate::parsers::lightweight::Language::Python,
            "py",
        );
        let tree = match parsed {
            Some(tree) => tree,
            None => return Vec::new(),
        };
        let root = tree.root_node();
        let source = content.as_bytes();
        let lines: Vec<&str> = content.lines().collect();

        evidence::collect_python_http_sites(root, source)
            .into_iter()
            .filter_map(|site| {
                let row = site.call_node.start_position().row;
                let call_line = lines.get(row).copied();

                // Same suppression rule as the line scanner, so a call that
                // was silenced there stays silenced when the flag is on.
                if let Some(text) = call_line {
                    let line_above = row.checked_sub(1).map(|above| lines[above]);
                    if crate::detectors::is_line_suppressed(text, line_above) {
                        return None;
                    }
                }

                let snippet = call_line.map(str::trim).unwrap_or("");
                Some(self.build_dual_branch_ssrf_finding(
                    path,
                    (row + 1) as u32,
                    site.api,
                    snippet,
                    site.call_node,
                    root,
                    source,
                    &lines,
                ))
            })
            .collect()
    }

    /// Assembles the dual-branch `Finding` for one Python HTTP call site.
    ///
    /// Evidence is extracted for `call_node`, tagged with `api`, and passed
    /// to [`predict::predict`]. The predicted label selects the title, the
    /// rationale paragraph of the description, and the suggested fix; the
    /// predicted severity becomes the finding's severity. The prediction's
    /// alternative branch, reasons, and resolution signals are then attached
    /// in that order.
    fn build_dual_branch_ssrf_finding(
        &self,
        path: &Path,
        line_num: u32,
        api: predict::HttpApi,
        snippet: &str,
        call_node: tree_sitter::Node<'_>,
        module_root: tree_sitter::Node<'_>,
        source: &[u8],
        lines: &[&str],
    ) -> Finding {
        let api_label = api.callee_label();
        let mut ev = evidence::extract_python_evidence(call_node, module_root, source, lines);
        ev.api = Some(api);
        let prediction = predict::predict(&ev);

        // One pass over the label yields all three label-dependent texts.
        let (title, rationale, fix) = match prediction.predicted {
            crate::dual_branch::BranchLabel::RealBug => (
                format!("Potential SSRF via {api_label}"),
                format!(
                    "The `{api_label}` call site shows evidence of attacker-\
                     influenceable URL input (request body / query / params \
                     within a handler-like scope) without a co-located \
                     allowlist check, scheme/hostname filter, or private-IP \
                     guard. The predictor leans RealBug for this call site \
                     (see `prediction_reasons`)."
                ),
                "Validate the URL against an allowlist of trusted hosts and \
                 reject private/loopback IPs before making the request. For \
                 Python, prefer the `advocate` library (drop-in \
                 `requests`-compatible HTTP client that blocks SSRF at the \
                 transport layer). If this is a false positive (the URL \
                 source is trusted via a path the v0 predictor doesn't see \
                 — e.g. a value loaded from config), annotate the call site \
                 with `# repotoire: ssrf-safe[<reason>]` to collapse the \
                 finding to Info."
                    .to_string(),
            ),
            crate::dual_branch::BranchLabel::Benign => (
                format!("Hardened HTTP request via {api_label} (informational)"),
                format!(
                    "The `{api_label}` call site uses a safe-by-default \
                     transport (advocate) and/or co-locates an allowlist \
                     check, scheme/hostname filter, or private-IP guard \
                     with the URL argument. The predictor leans Benign \
                     (see `prediction_reasons`); the original \
                     `severity_for`-table interpretation is carried in \
                     `alternative_branch`."
                ),
                "If this request IS attacker-reachable (the alternative \
                 branch), verify the allowlist body rejects internal \
                 hosts and private IPs, OR switch to `advocate` which \
                 enforces the policy at the socket layer. If the \
                 predictor is correct that this call is hardened, no \
                 action is needed."
                    .to_string(),
            ),
        };

        let description = format!(
            "**Server-Side Request Forgery (dual-branch, CWE-918)**\n\n\
             **API**: `{}`\n\n\
             **Location**: {}:{}\n\n\
             **Code**:\n```python\n{}\n```\n\n\
             {}",
            api_label,
            path.display(),
            line_num,
            snippet,
            rationale,
        );

        let base = Finding {
            id: String::new(),
            detector: "SsrfDetector".to_string(),
            severity: prediction.predicted_severity,
            title,
            description,
            affected_files: vec![path.to_path_buf()],
            line_start: Some(line_num),
            line_end: Some(line_num),
            suggested_fix: Some(fix),
            estimated_effort: Some("45 minutes".to_string()),
            category: Some("security".to_string()),
            cwe_id: Some("CWE-918".to_string()),
            why_it_matters: Some(
                "SSRF vulnerabilities allow attackers to:\n\
                 • Access internal services (cloud metadata, admin panels)\n\
                 • Exfiltrate data from internal networks\n\
                 • Pivot through the server to reach private hosts\n\
                 • Port-scan the internal network"
                    .to_string(),
            ),
            ..Default::default()
        };

        // Attach the dual-branch payload: alternative, then reasons, then
        // resolution signals.
        let with_alternative = base.with_alternative_branch(prediction.alternative_branch);
        let with_reasons = prediction
            .reasons
            .into_iter()
            .fold(with_alternative, |acc, reason| {
                acc.with_prediction_reason(reason)
            });
        prediction
            .resolutions
            .into_iter()
            .fold(with_reasons, |acc, resolution| {
                acc.with_resolution_signal(resolution)
            })
    }
}

impl Detector for SsrfDetector {
    fn name(&self) -> &'static str {
        "ssrf"
    }
    fn description(&self) -> &'static str {
        "Detects SSRF vulnerabilities"
    }

    fn bypass_postprocessor(&self) -> bool {
        true
    }

    crate::detectors::impl_taint_precompute!();

    fn taint_category(&self) -> Option<crate::detectors::taint::TaintCategory> {
        Some(TaintCategory::Ssrf)
    }

    fn file_extensions(&self) -> &'static [&'static str] {
        &["py", "js", "ts", "jsx", "tsx", "rb", "php", "java", "go"]
    }

    fn content_requirements(&self) -> crate::detectors::detector_context::ContentFlags {
        crate::detectors::detector_context::ContentFlags::HAS_HTTP_CLIENT
    }

    fn detect(
        &self,
        ctx: &crate::detectors::analysis_context::AnalysisContext,
    ) -> Result<Vec<Finding>> {
        let graph = ctx.graph;
        let files = &ctx.as_file_provider();
        let mut findings = vec![];

        // Phase 2f dual-branch gate. When `true`, Python `.py` files
        // go through the AST-driven predictor path
        // (`scan_python_file_dual_branch`) and skip the legacy line
        // scanner. Other languages and the flag-off path are
        // unchanged. Symmetric with xxe's `flag_on` (Phase 2e).
        let flag_on = ctx.dual_branch.is_enabled_for("ssrf");

        // Run taint analysis for SSRF (precomputed or fallback)
        let mut taint_paths = if let Some(cross) = self.precomputed_cross.get() {
            cross.clone()
        } else {
            self.taint_analyzer.trace_taint(graph, TaintCategory::Ssrf)
        };
        let intra_paths = if let Some(intra) = self.precomputed_intra.get() {
            intra.clone()
        } else {
            crate::detectors::taint::run_intra_function_taint(
                &self.taint_analyzer,
                graph,
                TaintCategory::Ssrf,
                &self.repository_path,
            )
        };
        taint_paths.extend(intra_paths);
        let taint_result = TaintAnalysisResult::from_paths(taint_paths);

        for path in files
            .files_with_extensions(&["py", "js", "ts", "jsx", "tsx", "rb", "php", "java", "go"])
        {
            if findings.len() >= self.max_findings {
                break;
            }

            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");

            // Phase 2f: AST-driven predictor path for Python when the
            // dual-branch flag is on. Replaces the legacy regex pass
            // for `.py` files; other languages and the flag-off path
            // fall through to the regex scanner below.
            if flag_on && ext == "py" {
                if let Some(content) = files.content(path) {
                    let dual = self.scan_python_file_dual_branch(path, &content);
                    for finding in dual {
                        findings.push(finding);
                        if findings.len() >= self.max_findings {
                            break;
                        }
                    }
                }
                continue;
            }

            if let Some(content) = files.content(path) {
                let file_str = path.to_string_lossy();
                let lines: Vec<&str> = content.lines().collect();

                for (i, line) in lines.iter().enumerate() {
                    let prev_line = if i > 0 { Some(lines[i - 1]) } else { None };
                    if crate::detectors::is_line_suppressed(line, prev_line) {
                        continue;
                    }

                    if HTTP_CLIENT.is_match(line) {
                        // Skip relative URLs - they always hit same-origin server
                        // Pattern: fetch('/api/...) or fetch(`/api/...)
                        if find_in(&FIND_FETCH_SLASH_SINGLE, line)
                            || find_in(&FIND_FETCH_BACKTICK_SLASH, line)
                            || find_in(&FIND_FETCH_DQUOTE_SLASH, line)
                        {
                            continue;
                        }

                        // Skip config constant URLs (API_URL, BASE_URL, etc.)
                        // These are from env/config, not user input
                        if find_in(&FIND_API_URL, line)
                            || find_in(&FIND_BASE_URL, line)
                            || find_in(&FIND_SERVER_URL, line)
                            || find_in(&FIND_BACKEND_URL, line)
                            || find_in(&FIND_API_URL_CAMEL, line)
                            || find_in(&FIND_BASE_URL_CAMEL, line)
                        {
                            // Additional check: must have interpolation to be potential SSRF
                            // If it's just API_URL + "/path", that's safe
                            let has_dynamic_path = find_in(&FIND_PARAMS, line)
                                || find_in(&FIND_DOT_QUERY, line)
                                || (find_in(&FIND_DOLLAR_BRACE, line)
                                    && !line.contains("${API_URL")
                                    && !line.contains("${BASE_URL")
                                    && !line.contains("${SERVER_URL"));
                            if !has_dynamic_path {
                                continue;
                            }
                        }

                        // Check if the URL source is likely from env/config (safe)
                        // or from user input (dangerous)
                        let is_env_sourced = {
                            // Look at surrounding context (20 lines before) for env var / config patterns
                            let context_start = i.saturating_sub(20);
                            let context = &lines[context_start..=i];
                            let context_str = context.join("\n").to_lowercase();

                            // URL variable comes from environment
                            find_in(&FIND_PROCESS_ENV, &context_str)
                                || context_str.contains("env.get(")
                                || context_str.contains("os.environ")
                                || context_str.contains("std::env")
                                || context_str.contains("config.")
                                || context_str.contains("options.base")
                                || context_str.contains("baseurl")
                                || context_str.contains("base_url")
                            // Function parameter named url/endpoint from config (not user input)
                            // Removed: "input" substring check caused false negatives (#23)
                        };

                        if is_env_sourced {
                            continue;
                        }

                        let has_user_input = find_in(&FIND_REQ_DOT, line)
                            || find_in(&FIND_REQUEST_BODY, line)
                            || find_in(&FIND_REQUEST_QUERY, line)
                            || find_in(&FIND_REQUEST_PARAMS, line)
                            || find_in(&FIND_CTX_PARAMS, line)
                            || find_in(&FIND_CTX_QUERY, line);
                        if has_user_input {
                            let line_num = (i + 1) as u32;

                            // Check taint analysis for this location
                            let matching_taint = taint_result.paths.iter().find(|p| {
                                (p.sink_file == file_str || p.source_file == file_str)
                                    && (p.sink_line == line_num || p.source_line == line_num)
                            });

                            // Adjust severity based on taint analysis
                            let (severity, description) = match matching_taint {
                                Some(taint_path) if taint_path.is_sanitized => {
                                    // Sanitizer found - lower severity
                                    (Severity::Low, format!(
                                        "HTTP request with user-controlled URL.\n\n\
                                         **Taint Analysis Note**: A sanitizer function (`{}`) was found \
                                         in the data flow path, which may mitigate this vulnerability.",
                                        taint_path.sanitizer.as_deref().unwrap_or("unknown")
                                    ))
                                }
                                Some(taint_path) => {
                                    // Unsanitized taint path - critical
                                    (Severity::Critical, format!(
                                        "HTTP request with user-controlled URL.\n\n\
                                         **Taint Analysis Confirmed**: Data flow analysis traced a path \
                                         from user input to this SSRF sink without sanitization:\n\n\
                                         `{}`",
                                        taint_path.path_string()
                                    ))
                                }
                                None => {
                                    // No taint path - use pattern-based severity
                                    (
                                        Severity::High,
                                        "HTTP request with user-controlled URL.".to_string(),
                                    )
                                }
                            };

                            findings.push(Finding {
                                id: String::new(),
                                detector: "SsrfDetector".to_string(),
                                severity,
                                title: "Potential SSRF vulnerability".to_string(),
                                description,
                                affected_files: vec![path.to_path_buf()],
                                line_start: Some(line_num),
                                line_end: Some(line_num),
                                suggested_fix: Some(
                                    "Validate URL against allowlist, block internal IPs."
                                        .to_string(),
                                ),
                                estimated_effort: Some("45 minutes".to_string()),
                                category: Some("security".to_string()),
                                cwe_id: Some("CWE-918".to_string()),
                                why_it_matters: Some(
                                    "Attackers could access internal services.".to_string(),
                                ),
                                ..Default::default()
                            });
                        }
                    }
                }
            }
        }

        // Filter out Low severity (sanitized) findings
        findings.retain(|f| f.severity != Severity::Low);

        Ok(findings)
    }
}

impl crate::detectors::RegisteredDetector for SsrfDetector {
    /// Registry constructor: builds an [`SsrfDetector`] from the repository
    /// path carried by `init` and returns it as a shared trait object.
    fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
        let detector = Self::new(init.repo_path);
        std::sync::Arc::new(detector)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::detectors::base::Detector;
    use crate::graph::builder::GraphBuilder;

    #[test]
    fn test_detects_requests_get_with_user_input() {
        // Python handler that hands `req.body["url"]` straight to `requests.get`.
        let source = "import requests\n\ndef fetch_url(req):\n    url = req.body.get(\"url\")\n    response = requests.get(req.body[\"url\"])\n    return response.text\n";
        let store = GraphBuilder::new().freeze();
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![("vuln.py", source)],
        );
        let findings = SsrfDetector::new("/mock/repo")
            .detect(&ctx)
            .expect("detection should succeed");

        let titles: Vec<_> = findings.iter().map(|f| &f.title).collect();
        let mentions_ssrf = titles.iter().any(|title| title.contains("SSRF"));
        let has_cwe_918 = findings
            .iter()
            .any(|f| f.cwe_id.as_deref() == Some("CWE-918"));

        assert!(
            !findings.is_empty(),
            "Should detect requests.get with user-controlled URL from req.body"
        );
        assert!(
            mentions_ssrf,
            "Finding should mention SSRF. Titles: {:?}",
            titles
        );
        assert!(has_cwe_918, "Finding should have CWE-918");
    }

    #[test]
    fn test_no_findings_for_hardcoded_url() {
        // A literal URL with no request input must stay silent.
        let source = "import requests\n\ndef fetch_data():\n    response = requests.get(\"https://api.example.com/data\")\n    return response.json()\n";
        let store = GraphBuilder::new().freeze();
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![("safe.py", source)],
        );
        let findings = SsrfDetector::new("/mock/repo")
            .detect(&ctx)
            .expect("detection should succeed");

        let titles: Vec<_> = findings.iter().map(|f| &f.title).collect();
        assert!(
            findings.is_empty(),
            "Hardcoded URL should have no SSRF findings, but got: {:?}",
            titles
        );
    }

    #[test]
    fn test_detects_fetch_with_user_input_in_js() {
        // JavaScript proxy handler that fetches `req.body.url` directly.
        let source = "async function proxyRequest(req, res) {\n    const targetUrl = req.body.url;\n    const response = await fetch(req.body.url);\n    const data = await response.json();\n    res.json(data);\n}\n";
        let store = GraphBuilder::new().freeze();
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![("proxy.js", source)],
        );
        let findings = SsrfDetector::new("/mock/repo")
            .detect(&ctx)
            .expect("detection should succeed");

        let has_cwe_918 = findings
            .iter()
            .any(|f| f.cwe_id.as_deref() == Some("CWE-918"));

        assert!(
            !findings.is_empty(),
            "Should detect fetch() with user-controlled URL from req.body"
        );
        assert!(has_cwe_918, "Finding should have CWE-918");
    }

    #[test]
    fn test_detects_urllib_with_user_input_in_python() {
        // `urlopen` called on a value read from `request.query`.
        let source = "from urllib.request import urlopen\n\ndef fetch(request):\n    url = request.query.get('target')\n    response = urlopen(request.query['target'])\n    return response.read()\n";
        let store = GraphBuilder::new().freeze();
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![("handler.py", source)],
        );
        let findings = SsrfDetector::new("/mock/repo")
            .detect(&ctx)
            .expect("detection should succeed");

        let mentions_ssrf = findings.iter().any(|f| f.title.contains("SSRF"));

        assert!(
            !findings.is_empty(),
            "Should detect urlopen with user-controlled URL from request.query"
        );
        assert!(mentions_ssrf, "Finding should mention SSRF");
    }

    #[test]
    fn test_no_finding_for_env_sourced_url() {
        // Base URL comes from `os.environ`; no request input is involved.
        let source = "import os\nimport requests\n\ndef call_api():\n    base = os.environ.get('API_HOST')\n    response = requests.get(base + '/health')\n    return response.status_code\n";
        let store = GraphBuilder::new().freeze();
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![("client.py", source)],
        );
        let findings = SsrfDetector::new("/mock/repo")
            .detect(&ctx)
            .expect("detection should succeed");

        let titles: Vec<_> = findings.iter().map(|f| &f.title).collect();
        assert!(
            findings.is_empty(),
            "URL sourced from environment variable should not trigger SSRF, but got: {:?}",
            titles
        );
    }

    #[test]
    fn test_no_finding_for_relative_fetch() {
        // `fetch('/api/users')` targets the same origin and must not be flagged.
        let source = "async function loadData(req, res) {\n    const data = await fetch('/api/users');\n    res.json(await data.json());\n}\n";
        let store = GraphBuilder::new().freeze();
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![("api.js", source)],
        );
        let findings = SsrfDetector::new("/mock/repo")
            .detect(&ctx)
            .expect("detection should succeed");

        let titles: Vec<_> = findings.iter().map(|f| &f.title).collect();
        assert!(
            findings.is_empty(),
            "Relative URL fetch should not trigger SSRF, but got: {:?}",
            titles
        );
    }

    // ─────────────────────────────────────────────────────────────────
    // Phase 2f dual-branch integration tests.
    //
    // Mirror the structure in xxe::tests (Phase 2e): a
    // `run_dual_branch` helper that flips the per-detector flag on,
    // then a pinned set of cases:
    //
    //   1. `flag_off_ssrf_emits_single_branch_unchanged` — pins the
    //      opt-in promise: legacy users see no behavioral change.
    //   2. `flag_on_python_ssrf_emits_dual_branch` — smoke for the
    //      wire-up: a Python SSRF site grows an `alternative_branch`.
    //   3. `flag_on_advocate_classifies_benign` — the canonical
    //      safe-by-construction Advocate Step 1.5 collapse (D1
    //      amendment).
    //   4. `flag_on_requests_in_handler_classifies_realbug` — the
    //      canonical RealBug path (decisions D1 worked example).
    //   5. `flag_on_allowlist_plus_private_ip_guard_classifies_benign`
    //      — the additive-collapse path: two strong positive signals
    //      together (allowlist + private-IP guard) overcome any
    //      negatives. Pinned by predict.rs test
    //      `allowlist_plus_private_ip_guard_predicts_benign`.
    //   6. `flag_on_non_python_unchanged` — JS SSRF still uses the
    //      legacy regex scanner (D4: per-language scope).
    //   7. `flag_on_ssrf_safe_annotation_collapses_to_info` — the
    //      escape hatch (Step 1 hard collapse on user annotation).
    //
    // Honest review note (2026-05-09): D1 includes the Advocate
    // safe-by-construction collapse — when the call site is on
    // `advocate.*` (transport-layer enforcement that blocks all
    // private IPs at the socket layer), the predictor short-circuits
    // to Benign/Info regardless of other signals. This mirrors the
    // 2e defusedxml collapse and is the principled asymmetry in
    // decisions doc §6: Advocate collapses (transport guarantee),
    // allowlist calls stay additive (presence ≠ correctness).
    // ─────────────────────────────────────────────────────────────────

    /// Runs the detector over a single mock file with the `ssrf` dual-branch
    /// flag switched on and returns whatever `detect` produces.
    fn run_dual_branch(file: &str, content: &str) -> Vec<Finding> {
        use crate::config::DualBranchConfig;
        use std::collections::HashMap;

        let cfg = DualBranchConfig {
            enabled: true,
            detectors: HashMap::from([("ssrf".to_string(), true)]),
        };

        let store = GraphBuilder::new().freeze();
        let mock_files = vec![(file, content)];
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store, mock_files,
        )
        .with_dual_branch(cfg);

        SsrfDetector::new("/mock/repo")
            .detect(&ctx)
            .expect("detection should succeed")
    }

    #[test]
    fn flag_off_ssrf_emits_single_branch_unchanged() {
        // Opt-in promise: with the flag at its default (off), a Python SSRF
        // site still fires, carries no `alternative_branch`, and has no
        // prediction reason with a non-zero weight.
        let source = "import requests\n\
                      def handler(req):\n\
                      \x20   return requests.get(req.body['url'])\n";
        let store = GraphBuilder::new().freeze();
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![("vuln.py", source)],
        );
        let findings = SsrfDetector::new("/mock/repo")
            .detect(&ctx)
            .expect("detection should succeed");

        assert!(!findings.is_empty(), "must still fire single-branch");
        findings.iter().for_each(|f| {
            assert!(
                f.alternative_branch.is_none(),
                "no alternative_branch when flag off: {:?}",
                f.title
            );

            let weighted: Vec<_> = f
                .prediction_reasons
                .iter()
                .map(|r| (&r.kind, r.weight))
                .collect();
            assert!(
                weighted.iter().all(|(_, weight)| *weight == 0.0),
                "no weight-bearing predictor reasons when flag off; \
                 graph-enrichment weight-0 reasons are allowed. reasons: {:?}",
                weighted
            );
        });
    }

    #[test]
    fn flag_on_python_ssrf_emits_dual_branch() {
        // Wire-up smoke test: with the flag on, `requests.get(req.body[...])`
        // in a handler yields a finding with an alternative branch and at
        // least one prediction reason.
        let source = "import requests\n\
                      def handler(req):\n\
                      \x20   return requests.get(req.body['url'])\n";
        let findings = run_dual_branch("vuln.py", source);
        assert!(!findings.is_empty(), "must fire dual-branch");

        let first = &findings[0];
        assert!(
            first.alternative_branch.is_some(),
            "alternative_branch must be populated when flag on. title={:?}",
            first.title
        );
        assert!(
            !first.prediction_reasons.is_empty(),
            "at least one prediction reason"
        );

        // The predicted branch is RealBug, so the alternative is Benign/Info.
        let alternative = first.alternative_branch.as_ref().unwrap();
        assert_eq!(alternative.label, crate::dual_branch::BranchLabel::Benign);
        assert_eq!(alternative.severity, Severity::Info);
    }

    #[test]
    fn flag_on_advocate_classifies_benign() {
        // Advocate collapse (D1, Step 1.5): a call on `advocate.*` is
        // predicted Benign/Info even though the URL comes from request
        // input; the RealBug reading is kept as the alternative branch.
        let source = "import advocate\n\
                      def handler(req):\n\
                      \x20   return advocate.get(req.body['url'])\n";
        let findings = run_dual_branch("safe.py", source);
        assert!(!findings.is_empty(), "must surface even when Benign");

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding");
        assert_eq!(
            dual.severity,
            Severity::Info,
            "predicted Benign via Advocate collapse → Info, got {:?}",
            dual.severity
        );

        let alternative = dual.alternative_branch.as_ref().unwrap();
        assert_eq!(alternative.label, crate::dual_branch::BranchLabel::RealBug);
    }

    #[test]
    fn flag_on_requests_in_handler_classifies_realbug() {
        use crate::dual_branch::BranchLabel;

        // Worked example from decision D1: `requests.get` applied to
        // `req.body['url']` inside a Flask-shaped handler. The
        // user_input (-0.50) and handler (-0.30) signals combine into
        // a strongly RealBug verdict.
        let source = concat!(
            "import requests\n",
            "def handle_proxy(req):\n",
            "    return requests.get(req.body['url'])\n",
        );
        let findings = run_dual_branch("vuln.py", source);
        assert!(!findings.is_empty());

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding");
        let is_high_or_critical = matches!(dual.severity, Severity::High | Severity::Critical);
        assert!(
            is_high_or_critical,
            "predicted RealBug uses 2D severity table — requests \
             on user input in a handler is High/Critical, got {:?}",
            dual.severity
        );

        // With RealBug predicted, the alternative is the Benign reading.
        let alt = dual.alternative_branch.as_ref().unwrap();
        assert_eq!(alt.label, BranchLabel::Benign);
    }

    #[test]
    fn flag_on_allowlist_plus_private_ip_guard_classifies_benign() {
        // Allowlist + private-IP guard collapse. When an explicit
        // validator and a private-IP guard sit together, their +0.40
        // and +0.30 additive signals outweigh any negative signals
        // and flip the label to Benign. This is additive evidence,
        // not safe-by-construction: we trust the *presence* of the
        // checks, not their correctness, and two strong positive
        // signals together is the strongest additive evidence we
        // accept. Pinned in predict.rs by the test
        // `allowlist_plus_private_ip_guard_predicts_benign`.
        //
        // Drop the private-IP guard and allowlist + user_input +
        // handler stays RealBug (decisions doc §6: principled
        // asymmetry — an allowlist alone does not overcome user-input
        // flow in a handler context).
        let source = concat!(
            "import ipaddress\n",
            "import requests\n",
            "def fetch_external(host_arg, url):\n",
            "    if not is_safe_url(url):\n",
            "        raise ValueError('blocked')\n",
            "    if ipaddress.ip_address(host_arg).is_private:\n",
            "        raise ValueError('blocked')\n",
            "    return requests.get(url)\n",
        );
        let findings = run_dual_branch("guarded.py", source);
        assert!(!findings.is_empty(), "must surface");

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding");
        assert_eq!(
            dual.severity,
            Severity::Info,
            "allowlist + private-IP guard → Benign/Info, got {:?}",
            dual.severity
        );
    }

    #[test]
    fn flag_on_non_python_unchanged() {
        // D4: per-language scope. In v0 the dual-branch predictor
        // covers Python only, so JS SSRF keeps flowing through the
        // legacy regex scanner and looks the same as with the flag off.
        let source = concat!(
            "async function proxy(req, res) {\n",
            "    const u = req.body.url;\n",
            "    return fetch(req.body.url);\n",
            "}\n",
        );
        let findings = run_dual_branch("proxy.js", source);
        assert!(!findings.is_empty(), "JS SSRF must still fire");

        // No JS finding may carry an alternative branch; report the
        // first one that does.
        let leaked = findings
            .iter()
            .find(|candidate| candidate.alternative_branch.is_some());
        if let Some(offender) = leaked {
            panic!(
                "JS findings stay single-branch in Phase 2f. title={:?}",
                offender.title
            );
        }
    }

    #[test]
    fn flag_on_ssrf_safe_annotation_collapses_to_info() {
        // Escape hatch: a `# repotoire: ssrf-safe[<reason>]` comment
        // on the call line collapses the finding to Benign/Info,
        // whatever the other signals say. Counterpart of 2e's
        // xxe-safe annotation.
        let source = concat!(
            "import requests\n",
            "def handler(req):\n",
            "    url = req.body['url']\n",
            "    return requests.get(url)  # repotoire: ssrf-safe[validated-by-cdn]\n",
        );
        let findings = run_dual_branch("explicit.py", source);
        assert!(!findings.is_empty(), "annotated site must still surface");

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding");
        assert_eq!(
            dual.severity,
            Severity::Info,
            "ssrf-safe annotation → Info, got {:?}",
            dual.severity
        );
    }

    // ─────────────────────────────────────────────────────────────────
    // Phase 2f real-world signature tests.
    //
    // These pin the predictor's behavior on minimized but recognizable
    // shapes from real Python codebases. Mirrors the 2e (XXE)
    // real-world tests: catch the day when an evidence-extractor
    // refactor accidentally breaks a known-correct verdict on a
    // real-world idiom.
    //
    // Three signatures pinned:
    //
    //   1. `real_advocate_canonical_usage` — drop-in `requests` →
    //      `advocate` migration. The recommended Python SSRF fix.
    //      Must classify Benign/Info via the Step 1.5 collapse (D1
    //      amendment), even with user-input in a handler.
    //   2. `real_url_parse_with_allowlist` — the urlparse-based
    //      validation pattern common in admin-fetch tools and
    //      webhook receivers. Must classify Benign/Info via the
    //      scheme/hostname-allowlist signal (+0.30), which stays
    //      unopposed because the test function's name is outside
    //      the handler lexicon.
    //   3. `real_naked_user_input_in_flask_handler` — the canonical
    //      CVE pattern: bare `requests.get(req.json['url'])` inside
    //      a Flask route, no validation. Must classify
    //      RealBug/High-or-Critical.
    //
    // The shapes are simplified to fit a single-file mock context.
    // The minimization is documented inline so a future contributor
    // can re-validate against upstream when the API drifts.
    // ─────────────────────────────────────────────────────────────────

    #[test]
    fn real_advocate_canonical_usage() {
        // The recommended Python SSRF fix: swap `requests` for
        // `advocate`. The original comment attributes advocate to
        // Yelp and cites its README and production-Yelp services.
        // NOTE(review): that attribution is unverified from this
        // file — upstream appears to be the JordanMilne/Advocate
        // repository; confirm before repeating it elsewhere.
        //
        // Real shape:
        //
        //   import advocate
        //   def fetch_remote(request):
        //       return advocate.get(request.json['url'])
        //
        // OR (Session form for connection reuse):
        //
        //   from advocate import Session
        //   _session = Session()
        //   def fetch(request):
        //       return _session.get(request.json['url'])
        //
        // Per the D1 Step 1.5 amendment both forms are expected to
        // collapse to Benign/Info regardless of user-input signals —
        // advocate enforces private-IP blocking at the socket layer.
        // Only the bare-call form (the simplest) is exercised by
        // this test.
        let findings = run_dual_branch(
            "real_advocate.py",
            "import advocate\n\
             def fetch_remote(request):\n\
             \x20   return advocate.get(request.json['url'])\n",
        );
        let f = findings
            .iter()
            .find(|f| f.is_dual_branch())
            .expect("dual-branch finding expected for advocate.get");
        assert_eq!(
            f.severity,
            Severity::Info,
            "advocate call must collapse to Info per D1 Step 1.5 \
             amendment; got {:?}, reasons={:?}",
            f.severity,
            f.prediction_reasons
                .iter()
                .map(|r| (&r.kind, r.weight))
                .collect::<Vec<_>>()
        );
        // The predicted branch must be Benign; the alternative
        // carries the conservative RealBug interpretation. `f` was
        // selected with `is_dual_branch()`, so re-asserting that
        // predicate here could never fail — the check lives in the
        // `expect` below instead.
        let alt = f
            .alternative_branch
            .as_ref()
            .expect("must carry alternative_branch for user inspection");
        assert_eq!(alt.label, crate::dual_branch::BranchLabel::RealBug);
    }

    #[test]
    fn real_url_parse_with_allowlist() {
        // urlparse-based validation, an idiom used widely in admin
        // tools (e.g. webhook receivers in Sentry-adjacent codebases,
        // image-fetcher microservices). Real shape:
        //
        //   from urllib.parse import urlparse
        //   ALLOWED_HOSTS = {'api.partner.com', 'cdn.partner.com'}
        //   def fetch_resource(url):
        //       parsed = urlparse(url)
        //       if parsed.scheme not in {'http', 'https'}:
        //           raise ValueError('bad scheme')
        //       if parsed.hostname not in ALLOWED_HOSTS:
        //           raise ValueError('host not allowed')
        //       return requests.get(url)
        //
        // The predictor sees a scheme-allowlist check and a
        // hostname-allowlist check; both map to the same
        // SCHEME_HOSTNAME signal, which fires once at +0.30. The
        // function name is chosen to fall outside the handler
        // lexicon (`fetch`/`route`/etc.), leaving that signal
        // unopposed → clearly Benign. (The actual project code calls
        // it `fetch_resource`; the rename isolates the allowlist
        // signal and the shape is otherwise unchanged.)
        let source = concat!(
            "from urllib.parse import urlparse\n",
            "import requests\n",
            "ALLOWED_HOSTS = {'api.partner.com', 'cdn.partner.com'}\n",
            "def get_external_resource(url):\n",
            "    parsed = urlparse(url)\n",
            "    if parsed.scheme not in {'http', 'https'}:\n",
            "        raise ValueError('bad scheme')\n",
            "    if parsed.hostname not in ALLOWED_HOSTS:\n",
            "        raise ValueError('host not allowed')\n",
            "    return requests.get(url)\n",
        );
        let findings = run_dual_branch("real_urlparse.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("dual-branch finding expected");
        // Collected up front so the failure message can list every
        // (kind, weight) pair behind the verdict.
        let reason_weights: Vec<_> = dual
            .prediction_reasons
            .iter()
            .map(|reason| (&reason.kind, reason.weight))
            .collect();
        assert_eq!(
            dual.severity,
            Severity::Info,
            "scheme+hostname allowlist (positive signal +0.30) on a \
             non-handler function → Benign/Info; got {:?}, reasons={:?}",
            dual.severity,
            reason_weights
        );
    }

    #[test]
    fn real_naked_user_input_in_flask_handler() {
        use crate::dual_branch::BranchLabel;

        // The canonical CVE shape: a bare `requests.get` on a
        // user-supplied URL inside a Flask route. Real examples
        // include CVE-2023-XXXX shapes in image-proxy services,
        // webhook forwarders, and "fetch by URL" admin endpoints
        // that never got the validation layer.
        //
        // Real shape:
        //
        //   from flask import request
        //   import requests
        //   @app.route('/proxy', methods=['POST'])
        //   def proxy_handler():
        //       return requests.get(request.json['url']).content
        //
        // The decorator is left out (tree-sitter renders it as a
        // sibling of `function_definition` and the predictor doesn't
        // look at it); the name `proxy_handler` carries the
        // "handler" classification on its own.
        //
        // Net score: user_input (-0.50) + handler (-0.30) = -0.80.
        // 2D severity table → RealBug × user_input_in_handler =
        // Critical (High is acceptable too).
        let source = concat!(
            "from flask import request\n",
            "import requests\n",
            "def proxy_handler():\n",
            "    return requests.get(request.json['url']).content\n",
        );
        let findings = run_dual_branch("real_flask_proxy.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("dual-branch finding expected");
        // Collected up front so the failure message can list every
        // (kind, weight) pair behind the verdict.
        let reason_weights: Vec<_> = dual
            .prediction_reasons
            .iter()
            .map(|reason| (&reason.kind, reason.weight))
            .collect();
        let is_high_or_critical = matches!(dual.severity, Severity::High | Severity::Critical);
        assert!(
            is_high_or_critical,
            "naked user-input in Flask handler must classify RealBug \
             High/Critical; got {:?}, reasons={:?}",
            dual.severity,
            reason_weights
        );

        let alt = dual.alternative_branch.as_ref().unwrap();
        assert_eq!(
            alt.label,
            BranchLabel::Benign,
            "alternative branch must be Benign/Info for user inspection"
        );
        assert_eq!(alt.severity, Severity::Info);
    }
}