aegis-scan 0.2.0

Supply chain security CLI for npm — detect malicious packages before installing
use std::path::PathBuf;
use std::sync::OnceLock;

use regex::Regex;

use crate::types::{Finding, FindingCategory, Severity};

use super::{truncate, Analyzer};

// ---------------------------------------------------------------------------
// Pattern definitions
// ---------------------------------------------------------------------------

/// A single detection rule: a regex plus the metadata that is copied into
/// every `Finding` the rule produces.
struct Pattern {
    /// Slot holding the compiled regex. It must have been initialised before
    /// the rule is evaluated; the analyzer panics on an empty slot.
    regex: &'static OnceLock<Regex>,
    /// Severity reported for a match.
    severity: Severity,
    /// Category reported for a match; also consulted when deciding whether a
    /// match is expected for the file it was found in.
    category: FindingCategory,
    /// Short headline used as the finding title.
    title: &'static str,
    /// Longer explanation used as the finding description.
    description: &'static str,
}

/// Declares a `static` named `$name` holding an initially empty
/// `OnceLock<Regex>`. The regex itself is compiled into the slot the first
/// time `patterns()` runs.
macro_rules! def_pattern {
    ($name:ident) => {
        static $name: OnceLock<Regex> = OnceLock::new();
    };
}

// One regex slot per detection rule, grouped by the severity the rule reports.

// CRITICAL: code execution, obfuscated evaluation, and command spawning
def_pattern!(RE_EVAL_DYNAMIC);
def_pattern!(RE_FUNCTION_CTOR);
def_pattern!(RE_BUFFER_EVAL);
def_pattern!(RE_CHILD_PROC_EXEC);
def_pattern!(RE_PIPE_TO_SHELL);

// HIGH: child_process imports, env harvesting, sensitive reads, raw sockets
def_pattern!(RE_REQUIRE_CHILD_PROC);
def_pattern!(RE_IMPORT_CHILD_PROC);
def_pattern!(RE_ENV_HARVEST);
def_pattern!(RE_SENSITIVE_READ);
def_pattern!(RE_RAW_SOCKET);

// MEDIUM: suspicious network use, file writes, runtime decryption
def_pattern!(RE_HTTP_HARDCODED_IP);
def_pattern!(RE_DNS_EXFIL);
def_pattern!(RE_FS_WRITE_SYNC);
def_pattern!(RE_WEBSOCKET);
def_pattern!(RE_CRYPTO_DECIPHER);

// LOW: generic network and file-read activity
def_pattern!(RE_FETCH_DYNAMIC);
def_pattern!(RE_XHR);
def_pattern!(RE_FS_READ);

fn patterns() -> &'static [Pattern] {
    static PATTERNS: OnceLock<Vec<Pattern>> = OnceLock::new();
    PATTERNS.get_or_init(|| {
        // Initialise every regex on first access
        RE_EVAL_DYNAMIC.get_or_init(|| {
            // eval( with dynamic content -- exclude eval("literal")
            Regex::new(r#"eval\s*\([^"'][^)]*\)"#).unwrap()
        });
        RE_FUNCTION_CTOR.get_or_init(|| Regex::new(r#"(?i)new\s+Function\s*\("#).unwrap());
        RE_BUFFER_EVAL.get_or_init(|| {
            // Buffer.from(...) on same line or nearby with eval/Function
            Regex::new(r#"Buffer\.from\s*\(.*(?:eval|Function)"#).unwrap()
        });
        RE_CHILD_PROC_EXEC.get_or_init(|| {
            // Must have child_process require/import nearby, not just any .exec() call
            Regex::new(r#"child_process['")\]]\s*\.\s*(?:exec|execSync|spawn|spawnSync|execFile|fork)\s*\("#).unwrap()
        });
        RE_PIPE_TO_SHELL
            .get_or_init(|| Regex::new(r#"(?:curl|wget)\s+[^\|]*\|\s*(?:bash|sh)\b"#).unwrap());

        RE_REQUIRE_CHILD_PROC
            .get_or_init(|| Regex::new(r#"require\s*\(\s*['"]child_process['"]\s*\)"#).unwrap());
        RE_IMPORT_CHILD_PROC
            .get_or_init(|| Regex::new(r#"import\s+.*from\s+['"]child_process['"]\s*"#).unwrap());
        RE_ENV_HARVEST.get_or_init(|| {
            // Two or more process.env accesses on the same line (harvesting)
            Regex::new(r#"process\.env\b.*process\.env\b"#).unwrap()
        });
        RE_SENSITIVE_READ.get_or_init(|| {
            Regex::new(
                r#"fs\.readFileSync\s*\(\s*['"](?:/etc/passwd|/etc/shadow|~/.ssh|~/.aws|~/.npmrc)"#,
            )
            .unwrap()
        });
        RE_RAW_SOCKET
            .get_or_init(|| Regex::new(r#"(?:net\.connect|dgram\.createSocket)\s*\("#).unwrap());

        RE_HTTP_HARDCODED_IP.get_or_init(|| {
            Regex::new(r#"https?\.request\s*\(\s*['"]https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"#)
                .unwrap()
        });
        RE_DNS_EXFIL.get_or_init(|| Regex::new(r#"dns\.(?:lookup|resolve)\s*\("#).unwrap());
        RE_FS_WRITE_SYNC.get_or_init(|| Regex::new(r#"fs\.writeFileSync\s*\("#).unwrap());
        RE_WEBSOCKET.get_or_init(|| Regex::new(r#"new\s+WebSocket\s*\("#).unwrap());
        RE_CRYPTO_DECIPHER.get_or_init(|| Regex::new(r#"crypto\.createDecipher\s*\("#).unwrap());

        RE_FETCH_DYNAMIC.get_or_init(|| {
            // fetch( with a variable, not a plain string literal
            Regex::new(r#"fetch\s*\([^"'][^)]*\)"#).unwrap()
        });
        RE_XHR.get_or_init(|| Regex::new(r#"XMLHttpRequest"#).unwrap());
        RE_FS_READ.get_or_init(|| Regex::new(r#"fs\.(?:readFileSync|readFile)\s*\("#).unwrap());

        vec![
            // CRITICAL
            Pattern {
                regex: &RE_EVAL_DYNAMIC,
                severity: Severity::Critical,
                category: FindingCategory::CodeExecution,
                title: "Dynamic eval() detected",
                description: "eval() with dynamic content can execute arbitrary code",
            },
            Pattern {
                regex: &RE_FUNCTION_CTOR,
                severity: Severity::Critical,
                category: FindingCategory::CodeExecution,
                title: "Function constructor with dynamic string",
                description:
                    "new Function() can execute arbitrary code, often used for obfuscation",
            },
            Pattern {
                regex: &RE_BUFFER_EVAL,
                severity: Severity::Critical,
                category: FindingCategory::Obfuscation,
                title: "Buffer.from + eval/Function obfuscation",
                description: "Decoding a buffer and evaluating it is a common malware pattern",
            },
            Pattern {
                regex: &RE_CHILD_PROC_EXEC,
                severity: Severity::Critical,
                category: FindingCategory::ProcessSpawn,
                title: "child_process exec/spawn call",
                description: "Direct command execution via child_process",
            },
            Pattern {
                regex: &RE_PIPE_TO_SHELL,
                severity: Severity::Critical,
                category: FindingCategory::ProcessSpawn,
                title: "Pipe-to-shell pattern (curl|bash)",
                description: "Downloading and executing remote scripts is extremely dangerous",
            },
            // HIGH
            Pattern {
                regex: &RE_REQUIRE_CHILD_PROC,
                severity: Severity::High,
                category: FindingCategory::ProcessSpawn,
                title: "require('child_process')",
                description: "Package imports child_process module",
            },
            Pattern {
                regex: &RE_IMPORT_CHILD_PROC,
                severity: Severity::High,
                category: FindingCategory::ProcessSpawn,
                title: "import from 'child_process'",
                description: "Package imports child_process module via ESM",
            },
            Pattern {
                regex: &RE_ENV_HARVEST,
                severity: Severity::High,
                category: FindingCategory::EnvAccess,
                title: "Environment variable harvesting",
                description: "Multiple process.env accesses suggest credential harvesting",
            },
            Pattern {
                regex: &RE_SENSITIVE_READ,
                severity: Severity::High,
                category: FindingCategory::FileSystemAccess,
                title: "Sensitive file read",
                description: "Reading sensitive system files (passwd, ssh keys, credentials)",
            },
            Pattern {
                regex: &RE_RAW_SOCKET,
                severity: Severity::High,
                category: FindingCategory::NetworkAccess,
                title: "Raw network socket",
                description: "Raw TCP/UDP socket usage outside normal HTTP patterns",
            },
            // MEDIUM
            Pattern {
                regex: &RE_HTTP_HARDCODED_IP,
                severity: Severity::Medium,
                category: FindingCategory::NetworkAccess,
                title: "HTTP request to hardcoded IP",
                description: "HTTP requests to raw IP addresses are suspicious",
            },
            Pattern {
                regex: &RE_DNS_EXFIL,
                severity: Severity::Medium,
                category: FindingCategory::NetworkAccess,
                title: "DNS lookup/resolve",
                description: "DNS operations can be used for data exfiltration",
            },
            Pattern {
                regex: &RE_FS_WRITE_SYNC,
                severity: Severity::Medium,
                category: FindingCategory::FileSystemAccess,
                title: "Synchronous file write",
                description: "fs.writeFileSync detected -- verify target path is safe",
            },
            Pattern {
                regex: &RE_WEBSOCKET,
                severity: Severity::Medium,
                category: FindingCategory::NetworkAccess,
                title: "WebSocket connection",
                description: "WebSocket connections can be used for C2 communication",
            },
            Pattern {
                regex: &RE_CRYPTO_DECIPHER,
                severity: Severity::Medium,
                category: FindingCategory::Obfuscation,
                title: "Crypto decipher usage",
                description: "Decrypting payloads at runtime may indicate hidden malicious code",
            },
            // LOW
            Pattern {
                regex: &RE_FETCH_DYNAMIC,
                severity: Severity::Low,
                category: FindingCategory::NetworkAccess,
                title: "fetch() with dynamic URL",
                description: "Network request with a dynamic URL",
            },
            Pattern {
                regex: &RE_XHR,
                severity: Severity::Low,
                category: FindingCategory::NetworkAccess,
                title: "XMLHttpRequest usage",
                description: "Legacy XHR detected -- uncommon in modern Node packages",
            },
            Pattern {
                regex: &RE_FS_READ,
                severity: Severity::Low,
                category: FindingCategory::FileSystemAccess,
                title: "File read operation",
                description: "File system read detected -- verify it reads expected paths",
            },
        ]
    })
}

// ---------------------------------------------------------------------------
// Analyzer
// ---------------------------------------------------------------------------

/// Reports whether `line` consists of nothing but comment text.
///
/// The check is purely line-local (no state is carried between lines), so it
/// recognises:
/// - `//` line comments,
/// - a `#!` hashbang line,
/// - a line that opens a block comment (`/*`) or starts with `*`, which is
///   taken to be the continuation or the `*/` terminator of a block comment.
///
/// When such a block comment is closed on the same line, whatever follows the
/// `*/` is code again: the line only counts as a comment if the remainder is
/// empty, a `//` comment, or a further block comment. This keeps
/// `/**/eval(payload)` from being skipped.
///
/// A leading `#` on its own is *not* a comment in JS/TS (it introduces a
/// private class member such as `#run() { ... }`), so only `#!` is accepted.
fn is_comment_line(line: &str) -> bool {
    let trimmed = line.trim_start();
    if trimmed.starts_with("//") || trimmed.starts_with("#!") {
        return true;
    }

    // Text that is (assumed to be) inside a block comment at the line start.
    let mut block = if let Some(body) = trimmed.strip_prefix("/*") {
        body
    } else if trimmed.starts_with('*') {
        trimmed
    } else {
        return false;
    };

    loop {
        // The comment never closes on this line: the whole line is comment.
        let Some(end) = block.find("*/") else {
            return true;
        };

        // `*/` is ASCII, so `end + 2` is always a valid char boundary.
        let after = block[end + 2..].trim_start();
        if after.is_empty() || after.starts_with("//") {
            return true;
        }
        match after.strip_prefix("/*") {
            // Another block comment follows; keep walking.
            Some(next) => block = next,
            // Real code follows the comment.
            None => return false,
        }
    }
}

/// Decides whether a match should be dropped because of the file it was found in.
///
/// Suppression depends on the rule's category and severity together with
/// substrings of `path`:
/// - `EnvAccess` rules (any severity) are dropped in files whose path
///   mentions a config/env/utility-style name.
/// - Low-severity `NetworkAccess` rules (fetch, XMLHttpRequest) are dropped in
///   adapter/transport/HTTP-client-style files.
/// - Low-severity `FileSystemAccess` rules (the generic file-read rule) are
///   dropped under script, test and build directories.
///
/// The first two checks compare against the lowercased path; the file-system
/// check compares against `path` as given (case-sensitive).
///
/// `line` is currently not inspected. Returns `true` when the match should be
/// skipped.
fn is_expected_pattern(path: &str, line: &str, pat: &Pattern) -> bool {
    /// Case-insensitive "does `path` contain any of `fragments`?"; the path is
    /// lowercased once per call rather than once per fragment.
    fn mentions_any(path: &str, fragments: &[&str]) -> bool {
        let lowered = path.to_lowercase();
        fragments.iter().any(|fragment| lowered.contains(*fragment))
    }

    // Unused for now; bound here to keep the compiler quiet.
    let _ = line;

    match (&pat.category, &pat.severity) {
        // process.env in a file that is clearly a config/proxy/env utility.
        (FindingCategory::EnvAccess, _) => mentions_any(
            path,
            &[
                "config", "env", "proxy", "defaults", "helpers", "utils", "settings",
            ],
        ),

        // XMLHttpRequest/fetch in adapter/transport files is expected.
        (FindingCategory::NetworkAccess, Severity::Low) => mentions_any(
            path,
            &[
                "adapters/",
                "transport/",
                "request",
                "http",
                "fetch",
                "xhr",
                "client",
            ],
        ),

        // File reads in build/test/scripts directories are expected.
        (FindingCategory::FileSystemAccess, Severity::Low) => {
            ["scripts/", "test/", "tests/", "__tests__/", "build/"]
                .iter()
                .any(|dir| path.contains(*dir))
        }

        _ => false,
    }
}

/// Static code analysis for dangerous patterns (eval, child_process, etc.).
///
/// A stateless unit struct: all scanning logic lives in its `Analyzer`
/// implementation, which runs the built-in regex rules over JS/TS sources.
pub struct StaticCodeAnalyzer;

impl Analyzer for StaticCodeAnalyzer {
    /// Scans every JS/TS file in `files` line by line and reports each line
    /// that matches one of the built-in patterns.
    ///
    /// A file is skipped entirely when:
    /// - its extension is not one of `js`, `cjs`, `mjs`, `ts`, `tsx`, `jsx`;
    /// - its file name contains `.min.` (minified output);
    /// - its path (case-insensitively) lies in a `dist`, `bundle`, `build`,
    ///   `umd`, `cjs` or `esm` directory.
    ///
    /// Within a scanned file, comment lines are ignored and matches that
    /// `is_expected_pattern` considers benign for that path are dropped. Every
    /// remaining (line, pattern) match yields one `Finding` carrying the
    /// pattern's metadata, the file path, the 1-based line number and the
    /// line passed through `truncate(line, 100)` as the snippet. Findings are
    /// emitted in file order, then line order, then pattern-table order.
    ///
    /// `_package_json` is not used by this analyzer.
    ///
    /// # Panics
    ///
    /// Panics if a pattern's regex slot has not been initialised.
    fn analyze(
        &self,
        files: &[(PathBuf, String)],
        _package_json: &serde_json::Value,
    ) -> Vec<Finding> {
        /// Path fragments marking build-output directories. Each is matched
        /// anywhere in the path, and without its leading `/` at the start of
        /// the path.
        const BUILD_OUTPUT_MARKERS: [&str; 6] =
            ["/dist/", "/bundle/", "/build/", "/umd/", "/cjs/", "/esm/"];

        let pats = patterns();
        let mut findings = Vec::new();

        for (path, content) in files {
            // Only scan JS/TS files
            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
            if !matches!(ext, "js" | "cjs" | "mjs" | "ts" | "tsx" | "jsx") {
                continue;
            }

            // Skip minified files — they trigger too many false positives
            let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
            if file_name.contains(".min.") {
                continue;
            }

            // Skip dist/bundle directories entirely — these are build outputs
            // and produce massive false positives (XHR, fetch, process.env, etc.)
            let path_str = path.display().to_string();
            let path_lower = path_str.to_lowercase();
            let in_build_output = BUILD_OUTPUT_MARKERS
                .iter()
                .copied()
                .any(|marker| path_lower.contains(marker) || path_lower.starts_with(&marker[1..]));
            if in_build_output {
                continue;
            }

            for (index, line) in content.lines().enumerate() {
                // Whether a line is a comment does not depend on the pattern,
                // so decide once per line and avoid running any regex on it.
                if is_comment_line(line) {
                    continue;
                }

                for pat in pats {
                    let re = pat.regex.get().expect("pattern not initialised");
                    if !re.is_match(line) {
                        continue;
                    }

                    // Skip expected patterns in source files
                    if is_expected_pattern(&path_str, line, pat) {
                        continue;
                    }

                    findings.push(Finding {
                        severity: pat.severity,
                        category: pat.category.clone(),
                        title: pat.title.to_string(),
                        description: pat.description.to_string(),
                        // Reuse the already-formatted path instead of formatting it again.
                        file: Some(path_str.clone()),
                        line: Some(index + 1),
                        snippet: Some(truncate(line, 100)),
                    });
                }
            }
        }

        findings
    }
}