nyx-scanner 0.3.0

A CLI security scanner for automating vulnerability checks
Documentation
use crate::labels::{Cap, DataLabel};
use crate::symbol::{FuncKey, Lang, normalize_namespace};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Serialisable summary of a single function's taint behaviour.
///
/// One of these is produced per function during **pass 1** of a scan and
/// persisted to the `function_summaries` SQLite table.  During **pass 2** the
/// full set of summaries across every file is loaded into memory so the taint
/// engine can resolve cross‑file calls.
///
/// Design notes
/// ────────────
/// * **All three cap fields are independent.**  A function can simultaneously
///   act as a source (introduces fresh taint), a sanitizer (cleans certain
///   bits), and a sink (passes tainted data to a dangerous operation).
///   The old code picked a single `DataLabel` which lost information.
///
/// * **`propagates_taint`** captures pass‑through behaviour: if an input
///   parameter is tainted, does the return value carry that taint?  This is
///   essential for chains like `let y = transform(tainted_x); sink(y);`.
///
/// * **`callees`** are recorded for future call‑graph construction
///   (topological analysis, approach 2) but are not used in pass‑1/pass‑2
///   taint resolution yet.
///
/// * **`tainted_sink_params`** marks which parameter *positions* flow to
///   internal sinks.  Today the taint engine treats the whole call as a
///   single "tainted or not" question; this field future‑proofs the summary
///   for per‑argument precision.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FuncSummary {
    /// Function name as it appears in the source (`my_func`, not the full path).
    pub name: String,

    /// Absolute path of the file that defines this function.
    pub file_path: String,

    /// Language slug (`"rust"`, `"javascript"`, …).
    pub lang: String,

    // ── Signature information ────────────────────────────────────────────
    /// Total number of parameters (including `self`/`&self` for methods).
    pub param_count: usize,

    /// Parameter names in declaration order.
    pub param_names: Vec<String>,

    // ── Taint behaviour ──────────────────────────────────────────────────
    // Stored as raw `u8` so serde doesn't need to know about `bitflags`.
    /// Caps this function **introduces** — i.e. the return value carries
    /// freshly‑tainted data even if no argument was tainted.
    pub source_caps: u8,

    /// Caps this function **cleans** — passing tainted data through this
    /// function strips the corresponding bits.
    pub sanitizer_caps: u8,

    /// Caps this function **consumes unsafely** — calling it with tainted
    /// arguments that still carry these bits is a finding.
    pub sink_caps: u8,

    /// `true` when taint on *any* input parameter can flow through to the
    /// return value.  Conservative: set to `true` if *any* code path
    /// propagates an argument to the return expression.
    pub propagates_taint: bool,

    /// Indices of parameters that flow to internal sinks (0‑based).
    pub tainted_sink_params: Vec<usize>,

    /// Names of functions/methods/macros called inside this function body.
    /// Stored for future call‑graph / topological‑sort analysis.
    pub callees: Vec<String>,
}

// ── Cap conversion helpers ──────────────────────────────────────────────

impl FuncSummary {
    #[inline]
    pub fn source_caps(&self) -> Cap {
        Cap::from_bits_truncate(self.source_caps)
    }

    #[inline]
    pub fn sanitizer_caps(&self) -> Cap {
        Cap::from_bits_truncate(self.sanitizer_caps)
    }

    #[inline]
    pub fn sink_caps(&self) -> Cap {
        Cap::from_bits_truncate(self.sink_caps)
    }

    /// Collapse the three independent cap fields back into the single
    /// `DataLabel` that the current taint engine expects.
    ///
    /// Priority: **Sink > Source > Sanitizer**.  Sinks first because
    /// missing a dangerous call‑site is worse than a false‑positive on a
    /// source.  Sources beat sanitizers because an un‑tracked source is
    /// a missed vulnerability, while an un‑tracked sanitizer only causes
    /// false positives.
    #[allow(dead_code)]
    pub fn primary_label(&self) -> Option<DataLabel> {
        let sink = self.sink_caps();
        let src = self.source_caps();
        let san = self.sanitizer_caps();

        if !sink.is_empty() {
            Some(DataLabel::Sink(sink))
        } else if !src.is_empty() {
            Some(DataLabel::Source(src))
        } else if !san.is_empty() {
            Some(DataLabel::Sanitizer(san))
        } else {
            None
        }
    }

    /// Returns `true` when this function has **any** observable taint
    /// effect — it is a source, sanitizer, sink, or propagates taint.
    #[allow(dead_code)]
    pub fn is_interesting(&self) -> bool {
        self.source_caps != 0
            || self.sanitizer_caps != 0
            || self.sink_caps != 0
            || self.propagates_taint
    }

    /// Build a [`FuncKey`] from this summary, normalizing the namespace
    /// relative to `scan_root`.
    pub fn func_key(&self, scan_root: Option<&str>) -> FuncKey {
        FuncKey {
            lang: Lang::from_slug(&self.lang).unwrap_or(Lang::Rust),
            namespace: normalize_namespace(&self.file_path, scan_root),
            name: self.name.clone(),
            arity: Some(self.param_count),
        }
    }
}

// ── Lookup map used by the taint engine ─────────────────────────────────

/// A merged view of all function summaries keyed by qualified [`FuncKey`].
///
/// Functions are partitioned by language + namespace + name + arity.  Two
/// functions with the same bare name but different languages or namespaces
/// are stored separately — no implicit cross-language merging occurs.
///
/// A secondary index `(Lang, name)` supports fast lookup by language + name
/// for same-language resolution in the taint engine.
#[derive(Default)]
pub struct GlobalSummaries {
    by_key: HashMap<FuncKey, FuncSummary>,
    by_lang_name: HashMap<(Lang, String), Vec<FuncKey>>,
}

impl GlobalSummaries {
    pub fn new() -> Self {
        Self::default()
    }

    /// Insert or merge a summary.  If an exact `FuncKey` match exists,
    /// merge conservatively (OR caps/booleans, union params/callees).
    pub fn insert(&mut self, key: FuncKey, summary: FuncSummary) {
        let lang = key.lang;
        let name = key.name.clone();

        self.by_key
            .entry(key.clone())
            .and_modify(|existing| {
                existing.source_caps |= summary.source_caps;
                existing.sanitizer_caps |= summary.sanitizer_caps;
                existing.sink_caps |= summary.sink_caps;
                existing.propagates_taint |= summary.propagates_taint;
                for &idx in &summary.tainted_sink_params {
                    if !existing.tainted_sink_params.contains(&idx) {
                        existing.tainted_sink_params.push(idx);
                    }
                }
                for c in &summary.callees {
                    if !existing.callees.contains(c) {
                        existing.callees.push(c.clone());
                    }
                }
            })
            .or_insert(summary);

        let keys = self.by_lang_name.entry((lang, name)).or_default();
        if !keys.contains(&key) {
            keys.push(key);
        }
    }

    /// Exact lookup by fully-qualified key.
    pub fn get(&self, key: &FuncKey) -> Option<&FuncSummary> {
        self.by_key.get(key)
    }

    /// All same-language matches for a bare function name.
    pub fn lookup_same_lang(&self, lang: Lang, name: &str) -> Vec<(&FuncKey, &FuncSummary)> {
        self.by_lang_name
            .get(&(lang, name.to_string()))
            .map(|keys| {
                keys.iter()
                    .filter_map(|k| self.by_key.get(k).map(|v| (k, v)))
                    .collect()
            })
            .unwrap_or_default()
    }

    /// Merge another `GlobalSummaries` into this one (for parallel fold/reduce).
    pub fn merge(&mut self, other: GlobalSummaries) {
        for (key, summary) in other.by_key {
            self.insert(key, summary);
        }
    }

    #[allow(dead_code)]
    pub fn is_empty(&self) -> bool {
        self.by_key.is_empty()
    }

    /// Iterate over all (key, summary) pairs.
    #[allow(dead_code)]
    pub fn iter(&self) -> impl Iterator<Item = (&FuncKey, &FuncSummary)> {
        self.by_key.iter()
    }
}

impl std::fmt::Debug for GlobalSummaries {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("GlobalSummaries")
            .field("len", &self.by_key.len())
            .finish()
    }
}

/// Merge a set of per‑file summaries into a single `GlobalSummaries` map.
///
/// Merging only happens for exact `FuncKey` matches (same lang + namespace +
/// name + arity).  Functions with the same bare name but different languages
/// or namespaces are stored separately.
pub fn merge_summaries(
    per_file: impl IntoIterator<Item = FuncSummary>,
    scan_root: Option<&str>,
) -> GlobalSummaries {
    let mut map = GlobalSummaries::new();

    for fs in per_file {
        let key = fs.func_key(scan_root);
        map.insert(key, fs);
    }

    map
}

#[cfg(test)]
mod tests;