repotoire 0.9.0

//! XSS Detection

use crate::detectors::base::{is_test_file, Detector, DetectorConfig};
use crate::detectors::fast_search::{find_in, *};
use crate::detectors::taint::{TaintAnalysisResult, TaintAnalyzer, TaintCategory};
use crate::models::{deterministic_finding_id, Evidence, Finding, Severity, SourceSpan, Tier};
use anyhow::Result;
use regex::Regex;
use std::collections::HashSet;
use std::path::PathBuf;
use std::sync::LazyLock;

static XSS_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)(innerHTML|outerHTML|document\.write|dangerouslySetInnerHTML|v-html|ng-bind-html|\[innerHTML\])").expect("valid regex")
});

pub struct XssDetector {
    repository_path: PathBuf,
    max_findings: usize,
    taint_analyzer: TaintAnalyzer,
    precomputed_cross: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
    precomputed_intra: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
}

impl XssDetector {
    pub fn new(repository_path: impl Into<PathBuf>) -> Self {
        Self {
            repository_path: repository_path.into(),
            max_findings: 50,
            taint_analyzer: TaintAnalyzer::new(),
            precomputed_cross: std::sync::OnceLock::new(),
            precomputed_intra: std::sync::OnceLock::new(),
        }
    }
}

impl Detector for XssDetector {
    fn name(&self) -> &'static str {
        "xss"
    }
    fn description(&self) -> &'static str {
        "Detects XSS vulnerabilities"
    }

    fn bypass_postprocessor(&self) -> bool {
        true
    }

    crate::detectors::impl_taint_precompute!();

    fn taint_category(&self) -> Option<crate::detectors::taint::TaintCategory> {
        Some(TaintCategory::Xss)
    }

    fn file_extensions(&self) -> &'static [&'static str] {
        &["py", "js", "ts", "jsx", "tsx", "rb", "php", "java"]
    }

    fn content_requirements(&self) -> crate::detectors::detector_context::ContentFlags {
        crate::detectors::detector_context::ContentFlags::HAS_TEMPLATE
    }

    fn detect(
        &self,
        ctx: &crate::detectors::analysis_context::AnalysisContext,
    ) -> Result<Vec<Finding>> {
        let graph = ctx.graph;
        let files = &ctx.as_file_provider();
        let mut findings = vec![];

        // Run taint analysis for XSS (precomputed or fallback)
        let mut taint_paths = if let Some(cross) = self.precomputed_cross.get() {
            cross.clone()
        } else {
            self.taint_analyzer.trace_taint(graph, TaintCategory::Xss)
        };
        let intra_paths = if let Some(intra) = self.precomputed_intra.get() {
            intra.clone()
        } else {
            crate::detectors::taint::run_intra_function_taint(
                &self.taint_analyzer,
                graph,
                TaintCategory::Xss,
                &self.repository_path,
            )
        };
        taint_paths.extend(intra_paths);
        let taint_result = TaintAnalysisResult::from_paths(taint_paths);

        // ── Blocking tier: SSA taint paths to HTML sinks ─────────────────────
        //
        // Gate on `!is_sanitized`, NOT `sanitizers_on_path.is_empty()`: the
        // intra-function heuristic engine sets only the boolean flag and always
        // leaves `sanitizers_on_path` empty (no sanitizer-name plumbing yet).
        //
        // (file, sink_line) pairs with a Blocking finding are tracked so the
        // line-heuristic loop below skips them rather than emitting a duplicate.
        let mut blocking_covered: HashSet<(&str, u32)> = HashSet::new();

        for tp in &taint_result.paths {
            if tp.category != TaintCategory::Xss {
                continue;
            }
            if tp.is_sanitized {
                continue;
            }

            blocking_covered.insert((tp.sink_file.as_str(), tp.sink_line));

            let source_span = SourceSpan {
                file: PathBuf::from(&tp.source_file),
                line_start: tp.source_line,
                line_end: tp.source_line,
                snippet: None,
            };
            let sink_span = SourceSpan {
                file: PathBuf::from(&tp.sink_file),
                line_start: tp.sink_line,
                line_end: tp.sink_line,
                snippet: None,
            };

            findings.push(Finding {
                id: deterministic_finding_id(
                    "XssDetector",
                    &tp.sink_file,
                    tp.sink_line,
                    "taint_html_sink",
                ),
                detector: "XssDetector".to_string(),
                severity: Severity::Critical,
                title: "XSS: tainted data flows to HTML sink (confirmed)".to_string(),
                description: format!(
                    "**Cross-Site Scripting — HTML Sink (CWE-79)**\n\n\
                     SSA taint analysis confirmed that user-controlled data from \
                     `{}` ({}:{}) reaches the HTML sink `{}` ({}:{}) without sanitization.\n\n\
                     **Taint path**: `{}`\n\n\
                     Attackers can inject arbitrary HTML/JavaScript, leading to session \
                     hijacking, credential theft, or malware delivery.",
                    tp.source_function,
                    tp.source_file,
                    tp.source_line,
                    tp.sink_callee_text,
                    tp.sink_file,
                    tp.sink_line,
                    tp.path_string(),
                ),
                affected_files: vec![PathBuf::from(&tp.sink_file)],
                line_start: Some(tp.sink_line),
                line_end: Some(tp.sink_line),
                suggested_fix: Some(
                    "Sanitize or encode HTML output (e.g. DOMPurify.sanitize, \
                     escapeHtml) or use textContent / innerText instead of innerHTML."
                        .to_string(),
                ),
                estimated_effort: Some("30 minutes".to_string()),
                category: Some("security".to_string()),
                cwe_id: Some("CWE-79".to_string()),
                why_it_matters: Some(
                    "XSS allows attackers to execute arbitrary scripts in the victim's browser."
                        .to_string(),
                ),
                tier: Tier::Blocking,
                deterministic: true,
                confidence: Some(0.95),
                evidence: Some(Evidence::TaintPath {
                    source: source_span,
                    sink: sink_span,
                    sink_kind: "html_sink".to_string(),
                    flow: vec![],
                    sanitizers_seen: tp.sanitizers_on_path.clone(),
                }),
                ..Default::default()
            });
        }

        for path in files.files_with_extensions(&["js", "ts", "jsx", "tsx", "vue", "html", "php"]) {
            if findings.len() >= self.max_findings {
                break;
            }

            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");

            // Skip test files - they often have test fixtures with XSS patterns
            if is_test_file(path) {
                continue;
            }

            // Skip non-served static HTML files (mockups, specs, design docs, fixtures)
            let path_str_lower = path.to_string_lossy().to_lowercase();
            if ext == "html"
                && (path_str_lower.contains("/mockup")
                    || path_str_lower.contains("/mock-")
                    || path_str_lower.contains("/specs/")
                    || path_str_lower.contains("/spec/")
                    || path_str_lower.contains("/fixture")
                    || path_str_lower.contains("/example")
                    || path_str_lower.contains("/demo")
                    || path_str_lower.contains("/design/")
                    || path_str_lower.contains("/prototype")
                    || path_str_lower.contains("/wireframe")
                    || path_str_lower.contains("/static/"))
            {
                continue;
            }

            // For HTML files, check if data comes from hardcoded arrays (not user input)
            // If the file contains no form inputs, fetch calls, or URL params, it's static
            if ext == "html" {
                if let Some(content) = files.content(path) {
                    let content_str: &str = &content;
                    let has_dynamic_input = find_in(&FIND_FETCH_PAREN, content_str)
                        || find_in(&FIND_XMLHTTPREQUEST, content_str)
                        || find_in(&FIND_LOCATION_SEARCH, content_str)
                        || find_in(&FIND_LOCATION_HASH, content_str)
                        || find_in(&FIND_DOCUMENT_COOKIE, content_str)
                        || find_in(&FIND_WINDOW_NAME, content_str)
                        || find_in(&FIND_POSTMESSAGE, content_str);
                    if !has_dynamic_input {
                        continue; // Pure static HTML with hardcoded data
                    }
                }
            }

            // Skip framework internals (React/Vue/Angular core SSR code)
            if path_str_lower.contains("fizzconfig")  // React SSR core
                || path_str_lower.contains("server/react")
                || path_str_lower.contains("dom-bindings")  // React DOM bindings
                || path_str_lower.contains("/packages/react-dom/")
                || path_str_lower.contains("/packages/vue/")
                || path_str_lower.contains("/packages/angular/")
            {
                continue;
            }

            if let Some(content) = files.content(path) {
                let file_str = path.to_string_lossy();
                let lines: Vec<&str> = content.lines().collect();

                for (i, line) in lines.iter().enumerate() {
                    let prev_line = if i > 0 { Some(lines[i - 1]) } else { None };
                    if crate::detectors::is_line_suppressed(line, prev_line) {
                        continue;
                    }

                    let line_num_pre = (i + 1) as u32;
                    // Skip lines already covered by a Blocking taint finding.
                    if blocking_covered.contains(&(file_str.as_ref(), line_num_pre)) {
                        continue;
                    }

                    if XSS_PATTERN.is_match(line) {
                        // Word-boundary checks to avoid FPs like inputStream, maxInput (#24)
                        let line_lower = line.to_lowercase();
                        let has_user_input = find_in(&FIND_REQ_DOT, &line_lower)
                            || find_in(&FIND_PROPS_DOT, &line_lower)
                            || find_in(&FIND_REQ_PARAMS, &line_lower)
                            || find_in(&FIND_REQ_QUERY, &line_lower)
                            || line_lower.contains(".params[")
                            || line_lower.contains(".query[")
                            || find_in(&FIND_USER_INPUT, &line_lower)
                            || line_lower.contains("userinput")
                            || line_lower.contains("form_data")
                            || line_lower.contains("formdata")
                            || find_in(&FIND_REQUEST_BODY, &line_lower)
                            || find_in(&FIND_REQUEST_QUERY, &line_lower);

                        let line_num = line_num_pre;

                        // Check taint analysis for this location
                        let matching_taint = taint_result.paths.iter().find(|p| {
                            (p.sink_file == file_str || p.source_file == file_str)
                                && (p.sink_line == line_num || p.source_line == line_num)
                        });

                        // Adjust severity based on taint analysis
                        let (severity, description) = match matching_taint {
                            Some(taint_path) if taint_path.is_sanitized => {
                                // Sanitizer found - lower severity
                                (Severity::Low, format!(
                                    "Direct HTML injection can lead to XSS attacks.\n\n\
                                     **Taint Analysis Note**: A sanitizer function (`{}`) was found \
                                     in the data flow path, which may mitigate this vulnerability.",
                                    taint_path.sanitizer.as_deref().unwrap_or("unknown")
                                ))
                            }
                            Some(taint_path) => {
                                // Unsanitized taint path - critical
                                (Severity::Critical, format!(
                                    "Direct HTML injection can lead to XSS attacks.\n\n\
                                     **Taint Analysis Confirmed**: Data flow analysis traced a path \
                                     from user input to this XSS sink without sanitization:\n\n\
                                     `{}`",
                                    taint_path.path_string()
                                ))
                            }
                            None => {
                                // No taint path - use pattern-based severity
                                let sev = if has_user_input {
                                    Severity::Critical
                                } else {
                                    Severity::Medium
                                };
                                (
                                    sev,
                                    "Direct HTML injection can lead to XSS attacks.".to_string(),
                                )
                            }
                        };

                        findings.push(Finding {
                            id: String::new(),
                            detector: "XssDetector".to_string(),
                            severity,
                            title: "Potential XSS vulnerability".to_string(),
                            description,
                            affected_files: vec![path.to_path_buf()],
                            line_start: Some(line_num),
                            line_end: Some(line_num),
                            suggested_fix: Some(
                                "Sanitize input or use textContent instead.".to_string(),
                            ),
                            estimated_effort: Some("30 minutes".to_string()),
                            category: Some("security".to_string()),
                            cwe_id: Some("CWE-79".to_string()),
                            why_it_matters: Some(
                                "XSS allows attackers to execute scripts in users' browsers."
                                    .to_string(),
                            ),
                            ..Default::default()
                        });
                    }
                }
            }
        }

        // Filter out Low severity (sanitized) findings
        findings.retain(|f| f.severity != Severity::Low);

        Ok(findings)
    }
}

impl crate::detectors::RegisteredDetector for XssDetector {
    fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
        std::sync::Arc::new(Self::new(init.repo_path))
    }

    fn max_tier() -> crate::models::Tier {
        crate::models::Tier::Blocking
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::detectors::base::Detector;
    use crate::detectors::taint::{TaintCategory, TaintPath};
    use crate::graph::builder::GraphBuilder;
    use crate::models::{Evidence, Tier};

    /// Build a minimal XSS [`TaintPath`] for injection via
    /// [`Detector::set_precomputed_taint`].
    fn make_xss_taint_path(
        source_file: &str,
        source_line: u32,
        sink_file: &str,
        sink_line: u32,
        sink_callee_text: &str,
        is_sanitized: bool,
    ) -> TaintPath {
        TaintPath {
            source_function: "handler".to_string(),
            source_file: source_file.to_string(),
            source_line,
            sink_function: sink_callee_text.to_string(),
            sink_file: sink_file.to_string(),
            sink_line,
            category: TaintCategory::Xss,
            call_chain: vec![],
            is_sanitized,
            sanitizer: None,
            confidence: 0.95,
            sink_callee_text: sink_callee_text.to_string(),
            sanitizers_on_path: vec![],
        }
    }

    #[test]
    fn test_detects_innerhtml_with_user_input() {
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(&store, vec![
            ("vuln.js", "function renderContent(user_input) {\n    document.getElementById(\"output\").innerHTML = user_input;\n}\n"),
        ]);
        let findings = detector.detect(&ctx).expect("detection should succeed");
        assert!(
            !findings.is_empty(),
            "Should detect innerHTML assignment with user input"
        );
        assert!(
            findings.iter().any(|f| f.title.contains("XSS")),
            "Finding should mention XSS. Titles: {:?}",
            findings.iter().map(|f| &f.title).collect::<Vec<_>>()
        );
        assert!(
            findings
                .iter()
                .any(|f| f.cwe_id.as_deref() == Some("CWE-79")),
            "Finding should have CWE-79"
        );
    }

    #[test]
    fn test_no_findings_for_textcontent() {
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(&store, vec![
            ("safe.js", "function renderContent(data) {\n    document.getElementById(\"output\").textContent = data;\n}\n"),
        ]);
        let findings = detector.detect(&ctx).expect("detection should succeed");
        assert!(
            findings.is_empty(),
            "Using textContent should have no XSS findings, but got: {:?}",
            findings.iter().map(|f| &f.title).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_detects_document_write_in_js() {
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(&store, vec![
            ("handler.js", "function showMessage(req) {\n    const msg = req.query.message;\n    document.write(req.query.message);\n}\n"),
        ]);
        let findings = detector.detect(&ctx).expect("detection should succeed");
        assert!(
            !findings.is_empty(),
            "Should detect document.write with user input from req.query"
        );
        assert!(
            findings
                .iter()
                .any(|f| f.cwe_id.as_deref() == Some("CWE-79")),
            "Finding should have CWE-79"
        );
    }

    #[test]
    fn test_detects_dangerously_set_innerhtml_in_tsx() {
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(&store, vec![
            ("component.tsx", "function UserProfile(props) {\n    return <div dangerouslySetInnerHTML={{ __html: props.bio }} />;\n}\n"),
        ]);
        let findings = detector.detect(&ctx).expect("detection should succeed");
        assert!(
            !findings.is_empty(),
            "Should detect dangerouslySetInnerHTML with props in TSX"
        );
        assert!(
            findings.iter().any(|f| f.severity == Severity::Critical),
            "Should be Critical severity when user input (props.) is present"
        );
    }

    #[test]
    fn test_no_finding_for_innerhtml_in_comment() {
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(&store, vec![
            ("safe.js", "function render() {\n    // Never use innerHTML with user input\n    document.getElementById(\"out\").textContent = \"safe\";\n}\n"),
        ]);
        let findings = detector.detect(&ctx).expect("detection should succeed");
        // The comment mentions innerHTML but has no user input pattern on that line,
        // so it should either be empty or at most Medium (no user input marker).
        // Since there's no actual user input keyword on the comment line, it won't be Critical.
        let critical = findings
            .iter()
            .filter(|f| f.severity == Severity::Critical)
            .count();
        assert_eq!(
            critical, 0,
            "innerHTML in a comment should not produce Critical findings"
        );
    }

    #[test]
    fn test_no_finding_for_innerhtml_in_string_literal() {
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(&store, vec![
            ("safe.js", "function getDocs() {\n    const help = \"Use textContent instead of innerHTML for safety\";\n    return help;\n}\n"),
        ]);
        let findings = detector.detect(&ctx).expect("detection should succeed");
        // innerHTML appears inside a string literal, no actual DOM API call
        let critical = findings
            .iter()
            .filter(|f| f.severity == Severity::Critical)
            .count();
        assert_eq!(
            critical, 0,
            "innerHTML mentioned in a string literal should not produce Critical findings"
        );
    }

    // ── Blocking-tier tests (Task 8c) ─────────────────────────────────────────

    #[test]
    fn taint_to_dangerous_sink_is_blocking() {
        // Inject a pre-computed XSS taint path: unsanitized flow to innerHTML.
        // The detector should emit a Blocking finding with TaintPath evidence.
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");

        let tp = make_xss_taint_path(
            "handler.js",
            1,
            "handler.js",
            2,
            "innerHTML",
            false, // not sanitized
        );
        detector.set_precomputed_taint(vec![], vec![tp]);

        // Provide file content so the line-scan loop also runs (but the taint
        // loop is independent and uses the pre-computed paths directly).
        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![(
                "handler.js",
                "function r(req) {\n  el.innerHTML = req.query.name;\n}\n",
            )],
        );
        let findings = detector.detect(&ctx).expect("detection should succeed");

        let blocking: Vec<_> = findings
            .iter()
            .filter(|f| f.tier == Tier::Blocking)
            .collect();
        assert!(
            !blocking.is_empty(),
            "Expected at least one Blocking finding, got: {:?}",
            findings
                .iter()
                .map(|f| (&f.title, &f.tier))
                .collect::<Vec<_>>()
        );
        let f = &blocking[0];
        assert_eq!(
            f.severity,
            Severity::Critical,
            "Blocking XSS must be Critical"
        );
        assert!(f.deterministic, "Blocking finding must be deterministic");
        assert!(
            f.confidence.unwrap_or(0.0) >= 0.90,
            "Blocking finding confidence must be >= 0.90, got {:?}",
            f.confidence
        );
        assert!(
            matches!(
                &f.evidence,
                Some(Evidence::TaintPath { sink_kind, .. }) if sink_kind == "html_sink"
            ),
            "Evidence must be TaintPath {{ sink_kind: \"html_sink\" }}, got {:?}",
            f.evidence
        );
    }

    #[test]
    fn sanitized_path_is_advisory() {
        // A sanitized taint path must NOT produce a Blocking finding.
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");

        let tp = make_xss_taint_path(
            "handler.js",
            1,
            "handler.js",
            2,
            "innerHTML",
            true, // sanitized
        );
        detector.set_precomputed_taint(vec![], vec![tp]);

        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            // Sanitized path: DOMPurify.sanitize wraps the input before innerHTML.
            vec![(
                "handler.js",
                "function r(req) {\n  el.innerHTML = DOMPurify.sanitize(req.query.name);\n}\n",
            )],
        );
        let findings = detector.detect(&ctx).expect("detection should succeed");

        assert!(
            findings.iter().all(|f| f.tier != Tier::Blocking),
            "Sanitized taint path must not produce Blocking findings, got: {:?}",
            findings
                .iter()
                .map(|f| (&f.title, &f.tier))
                .collect::<Vec<_>>()
        );
        assert!(
            findings.iter().all(|f| f.evidence.is_none()),
            "Sanitized path findings must have no evidence, got: {:?}",
            findings.iter().map(|f| &f.evidence).collect::<Vec<_>>()
        );
    }

    #[test]
    fn taint_to_benign_callee_is_advisory() {
        // A taint path with a non-XSS category must not trigger Blocking
        // (the XSS detector only promotes paths whose category is XSS).
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");

        // Inject a CommandInjection path — the XSS detector must ignore it
        // (category != Xss → is_html_sink returns false).
        let tp = TaintPath {
            source_function: "handler".to_string(),
            source_file: "handler.js".to_string(),
            source_line: 1,
            sink_function: "exec".to_string(),
            sink_file: "handler.js".to_string(),
            sink_line: 2,
            category: TaintCategory::CommandInjection,
            call_chain: vec![],
            is_sanitized: false,
            sanitizer: None,
            confidence: 0.95,
            sink_callee_text: "exec".to_string(),
            sanitizers_on_path: vec![],
        };
        detector.set_precomputed_taint(vec![], vec![tp]);

        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![(
                "handler.js",
                "function r(req) {\n  exec(req.query.cmd);\n}\n",
            )],
        );
        let findings = detector.detect(&ctx).expect("detection should succeed");

        assert!(
            findings.iter().all(|f| f.tier != Tier::Blocking),
            "Non-HTML-sink taint path must not produce Blocking findings, got: {:?}",
            findings
                .iter()
                .map(|f| (&f.title, &f.tier))
                .collect::<Vec<_>>()
        );
    }

    #[test]
    fn line_heuristic_match_is_advisory() {
        // A finding produced purely by the regex/line heuristic (no taint path)
        // must be Advisory (the default tier) with no evidence.
        let store = GraphBuilder::new().freeze();
        let detector = XssDetector::new("/mock/repo");
        // No taint paths injected.

        let ctx = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &store,
            vec![(
                "handler.js",
                "function r(req) {\n  el.innerHTML = req.query.name;\n}\n",
            )],
        );
        let findings = detector.detect(&ctx).expect("detection should succeed");

        // Must have at least one finding (the line heuristic fires on innerHTML + req.query).
        assert!(
            !findings.is_empty(),
            "Line heuristic should produce at least one finding"
        );
        assert!(
            findings.iter().all(|f| f.tier != Tier::Blocking),
            "Line-heuristic findings must be Advisory (no taint path injected), got: {:?}",
            findings
                .iter()
                .map(|f| (&f.title, &f.tier))
                .collect::<Vec<_>>()
        );
        assert!(
            findings.iter().all(|f| f.evidence.is_none()),
            "Line-heuristic findings must have no evidence, got: {:?}",
            findings.iter().map(|f| &f.evidence).collect::<Vec<_>>()
        );
    }
}