// pyrograph 0.1.0

use crate::ir::{EdgeKind, NodeId, TaintGraph};
use crate::labels::TaintLabel;
use crate::lib_types::{Severity, TaintFinding};
use std::collections::{HashMap, HashSet, VecDeque};

/// Decides whether taint flowing from `source_category` into `sink_category`
/// is worth reporting.
///
/// This is the "taint coloring" table: it suppresses false positives such as
/// `process.env.PORT → res.send()` (ordinary configuration) while still
/// reporting flows such as `process.env.TOKEN → fetch()` (exfiltration).
///
/// Both arguments are compared against fixed, case-sensitive category names.
/// Any pair that is not listed below — including unknown categories — yields
/// `false`.
pub fn is_dangerous_combination(source_category: &str, sink_category: &str) -> bool {
    match source_category {
        // Secrets, host information, sensitive files and npm install scripts:
        // flagged when executed, sent over the network, or written to disk
        // (e.g. eval(process.env.CMD), fetch(os.hostname()),
        // copying .npmrc to a staging location, install script dropping payloads).
        // SQL and XSS sinks are deliberately excluded here: credential → sql is
        // normal DB configuration, system → sql is storing system info, and
        // environment variables do not cause XSS.
        "credential" | "system" | "sensitive-file" | "npm-script" => {
            matches!(sink_category, "exec" | "network" | "file")
        }
        // Generic file content: exfiltration, execution, or SQL injection.
        "file" => matches!(sink_category, "exec" | "network" | "sql"),
        // CLI arguments: exec(process.argv[2]), fetch(process.argv[2]),
        // dropper staging, SQL injection.
        "cli" => matches!(sink_category, "exec" | "network" | "file" | "sql"),
        // Fetched data, decoded buffers and HTTP user input reach every
        // tracked sink kind: command injection, SSRF/relay, payload or
        // path-traversal writes, (second-order) SQL injection and XSS.
        "network-input" | "buffer" | "http" => {
            matches!(sink_category, "exec" | "network" | "file" | "sql" | "xss")
        }
        // Shell command strings, e.g. exec('curl ...').
        "shell" => matches!(sink_category, "exec" | "network"),
        // Everything else is not dangerous by default.
        _ => false,
    }
}

/// Walks the taint graph breadth-first from every source node and collects
/// one finding per reachable sink.
///
/// For each node yielded by `graph.sources()` that carries a `Source` or
/// `Both` label, the search follows only dataflow edges (see
/// [`is_dataflow_edge`]) and does not expand nodes labelled as sanitizers.
/// Whenever an edge lands on a node labelled `Sink` or `Both`, the
/// source/sink category pair is checked with [`is_dangerous_combination`];
/// if the graph has no label set, every source→sink flow is reported
/// (backward compatibility).
///
/// Findings are deduplicated on `(source index, sink index, sink node)`
/// across the whole run. The reported path follows the BFS parent links
/// recorded the first time each node was discovered.
pub fn analyze_cpu(graph: &TaintGraph) -> Vec<TaintFinding> {
    let mut findings = Vec::new();
    let mut reported = HashSet::new();

    for origin in graph.sources() {
        let source_enum = match origin.label {
            Some(TaintLabel::Source(idx)) | Some(TaintLabel::Both(idx, _)) => idx,
            _ => continue,
        };

        // Category of the source, used for taint coloring below.
        let source_category = graph
            .label_set()
            .and_then(|labels| labels.sources.get(source_enum))
            .map_or("unknown", |entry| entry.category.as_str());

        // Per-source BFS state.
        let mut frontier = VecDeque::from([origin.id]);
        let mut visited = HashSet::from([origin.id]);
        let mut came_from = HashMap::new();

        while let Some(current_id) = frontier.pop_front() {
            // A sanitizer node is reachable but taint does not flow past it.
            let sanitized = graph
                .node(current_id)
                .and_then(|node| node.label)
                .is_some_and(|label| label.is_sanitizer());
            if sanitized {
                continue;
            }

            for (neighbor_id, edge_kind) in graph.edges_from(current_id) {
                // Ignore edges leading back to the origin and edges that do
                // not carry data.
                if neighbor_id == origin.id || !is_dataflow_edge(&edge_kind) {
                    continue;
                }

                // `insert` returns true only on first discovery, which is
                // when the parent link and queue entry are recorded.
                if visited.insert(neighbor_id) {
                    came_from.insert(neighbor_id, current_id);
                    frontier.push_back(neighbor_id);
                }

                // Only nodes labelled as sinks can produce a finding.
                let sink_enum = match graph.node(neighbor_id).and_then(|node| node.label) {
                    Some(TaintLabel::Sink(idx)) | Some(TaintLabel::Both(_, idx)) => idx,
                    _ => continue,
                };

                // Taint coloring: with a label set, report only dangerous
                // (source_category, sink_category) pairs, e.g. catch
                // process.env.TOKEN → fetch() but not process.env.PORT →
                // db.query(). Without a label set, report everything.
                let dangerous = match graph.label_set() {
                    None => true,
                    Some(labels) => {
                        let sink_category = labels
                            .sinks
                            .get(sink_enum)
                            .map_or("unknown", |entry| entry.category.as_str());
                        is_dangerous_combination(source_category, sink_category)
                    }
                };

                // The dedup set is only touched for dangerous flows.
                if !dangerous || !reported.insert((source_enum, sink_enum, neighbor_id)) {
                    continue;
                }

                findings.push(TaintFinding {
                    source: source_enum,
                    sink: sink_enum,
                    path: reconstruct_path(&came_from, origin.id, neighbor_id),
                    severity: severity_for_sink(graph, neighbor_id, sink_enum),
                });
            }
        }
    }

    findings
}

/// Rebuilds the node sequence from `source` to `sink` using the `parent`
/// links (child → parent).
///
/// The walk starts at `sink` and follows parents until it reaches `source`.
/// If a node on the way has no recorded parent, the walk stops there, so the
/// returned path may begin at a node other than `source`. The result is
/// ordered from the source end to `sink` and always contains at least `sink`.
fn reconstruct_path(parent: &HashMap<NodeId, NodeId>, source: NodeId, sink: NodeId) -> Vec<NodeId> {
    // Generate sink, parent(sink), parent(parent(sink)), ... ending after
    // `source` is produced or when a parent link is missing.
    let mut trail: Vec<NodeId> = std::iter::successors(Some(sink), |&node| {
        if node == source {
            None
        } else {
            parent.get(&node).copied()
        }
    })
    .collect();

    // The walk ran sink → source; callers expect source → sink.
    trail.reverse();
    trail
}

/// Maps a sink to a severity based on its category.
///
/// The category is taken from the graph's label set entry at `sink_idx` when
/// one exists. Otherwise it is guessed from the name of node `sink_id`; if
/// that node cannot be found either, the category is `"other"`.
///
/// `exec` and `sql` sinks are `Critical`, `network` and `xss` are `High`,
/// `file` is `Medium`, and anything else is `Low`.
fn severity_for_sink(graph: &TaintGraph, sink_id: NodeId, sink_idx: usize) -> Severity {
    /// Guesses a sink category from substrings of the node name. The checks
    /// are ordered: exec markers win over network markers, which win over
    /// file markers.
    fn category_from_name(name: &str) -> &'static str {
        const EXEC_MARKERS: [&str; 3] = ["eval", "exec", "Function"];
        const NETWORK_MARKERS: [&str; 4] = ["fetch", "request", "dns.", "net."];
        const FILE_MARKERS: [&str; 2] = ["fs.writeFile", "fs.appendFile"];

        let mentions_any = |markers: &[&str]| markers.iter().any(|&marker| name.contains(marker));

        if mentions_any(&EXEC_MARKERS) {
            "exec"
        } else if mentions_any(&NETWORK_MARKERS) {
            "network"
        } else if mentions_any(&FILE_MARKERS) {
            "file"
        } else {
            "other"
        }
    }

    // Category declared in the label set, if there is one for this sink.
    let declared = graph
        .label_set()
        .and_then(|labels| labels.sinks.get(sink_idx))
        .map(|entry| entry.category.as_str());

    let category = match declared {
        Some(declared_category) => declared_category,
        // No declared category: fall back to the name-based guess.
        None => graph
            .node(sink_id)
            .map_or("other", |node| category_from_name(&node.name)),
    };

    match category {
        // Code execution and SQL injection.
        "exec" | "sql" => Severity::Critical,
        // Network access and cross-site scripting.
        "network" | "xss" => Severity::High,
        "file" => Severity::Medium,
        _ => Severity::Low,
    }
}

/// Reports whether `edge_kind` is an edge kind that the taint search follows.
///
/// Returns `true` for `Assignment`, `Argument` and `Return` edges and `false`
/// for every other kind.
pub fn is_dataflow_edge(edge_kind: &EdgeKind) -> bool {
    matches!(
        *edge_kind,
        EdgeKind::Return | EdgeKind::Argument | EdgeKind::Assignment
    )
}