repotoire 0.7.1

Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
Documentation
//! Python import alias resolution for AST-first detectors.
//!
//! Walks top-level import nodes and builds maps that detectors use to
//! recognize aliased call shapes without any whole-program analysis.
//!
//! Two complementary walkers are exposed:
//!
//! - [`collect_python_from_imports`] handles `from M import X [as Y]`.
//!   Originally implemented inline in `insecure_crypto.rs` (commit
//!   `32021903`); extracted here once the same pattern was needed by
//!   `eval_detector`, `command_injection`, and `pickle_detector`
//!   (commit `a6d7ed2d`).
//! - [`collect_python_module_aliases`] handles `import M [as N]`. The
//!   from-import walker was insufficient for `import hashlib as hl;
//!   hl.md5(data)` — `hl.md5` parses as an attribute call whose object
//!   text is `"hl"`, not `"hashlib"`. This walker lets each detector's
//!   attribute-call branch resolve `hl` back to `hashlib`.
//!
//! Both maps are independent and never interfere: the from-import map
//! is keyed on names that appear as bare-identifier callees, the
//! module-aliases map is keyed on names that appear as the OBJECT of an
//! attribute callee. Each detector consults the relevant map for the
//! call shape it's seeing.
//!
//! ## Edge cases (shared by both walkers)
//!
//! - `from M import *` — wildcard, unresolvable; skipped.
//! - `from . import X` / `from ..pkg import X` — relative import with
//!   no usable module name; skipped (we only emit entries when the
//!   `module_name` field is a non-empty dotted name).
//! - Function-local / conditional imports — only top-level
//!   `import_statement` / `import_from_statement` children of the
//!   module root are walked.
//! - Name imported from multiple modules — last write wins.
//! - `from M import X as Y` — `Y` (the local alias) maps to `M`.
//! - `import M as N` — `N` (the local alias) maps to `M`.
//! - `import os.path` — dotted module; emits `{"os.path": "os.path"}`.
//!   Detector matchers typically check the first segment only, so this
//!   is mostly inert for security detection but kept for correctness.

use std::collections::HashMap;

use tree_sitter::Node;

/// Bundle of the two Python import-alias maps used by AST matchers
/// across security detectors.
///
/// `imports` maps `local_name -> module` (built by
/// [`collect_python_from_imports`]); `modules` maps `local_alias ->
/// module_name` (built by [`collect_python_module_aliases`]). Every
/// matcher that consumes one consumes both, so they travel as a unit.
///
/// The bundle borrows the underlying maps; construct via
/// [`PythonAliases::new`] at the highest scope where both maps exist
/// (typically right after building them in `scan_file_ast`) and pass
/// `&aliases` down through the recursive `collect_*_sites` walkers.
pub(super) struct PythonAliases<'a> {
    /// From-import bindings: local name (or its `as` alias) to the
    /// module it was imported from. Expected to be the map returned by
    /// [`collect_python_from_imports`].
    pub imports: &'a HashMap<String, String>,
    /// Plain-import bindings: local module name (or its `as` alias) to
    /// the canonical module name. Expected to be the map returned by
    /// [`collect_python_module_aliases`].
    pub modules: &'a HashMap<String, String>,
}

impl<'a> PythonAliases<'a> {
    pub(super) fn new(
        imports: &'a HashMap<String, String>,
        modules: &'a HashMap<String, String>,
    ) -> Self {
        Self { imports, modules }
    }
}

/// Walk top-level `import_from_statement` nodes and build a map of
/// `local_name → module`.
///
/// E.g. `from hashlib import md5, sha1 as sha_one` produces:
///   - `"md5" → "hashlib"`
///   - `"sha_one" → "hashlib"` (the aliased local name maps to the module)
///
/// Only direct children of `root` are inspected, so imports nested in
/// functions, classes, or conditionals are never recorded. A statement
/// contributes nothing when:
///
/// - its `module_name` field is missing, is a `relative_import`
///   (`from . import X`, `from ..pkg import X`), or has empty /
///   non-UTF-8 text;
/// - the imported item is neither a `dotted_name` nor an
///   `aliased_import` (for example a wildcard import).
///
/// When the same local name is bound by several statements, the last
/// one in source order wins.
pub(super) fn collect_python_from_imports(
    root: Node<'_>,
    source: &[u8],
) -> HashMap<String, String> {
    let mut map = HashMap::new();
    let mut cursor = root.walk();
    for stmt in root.children(&mut cursor) {
        if stmt.kind() != "import_from_statement" {
            continue;
        }
        // Look the field up once; the node is reused below both for its
        // text and to recognise (and skip) it among the children.
        let module_node = match stmt.child_by_field_name("module_name") {
            Some(node) => node,
            None => continue,
        };
        // Relative imports carry text such as "." or "..pkg", which is
        // non-empty but is not a module name a detector can match on.
        if module_node.kind() == "relative_import" {
            continue;
        }
        let module = match node_text(module_node, source).filter(|m| !m.is_empty()) {
            Some(text) => text,
            None => continue,
        };
        let module_id = module_node.id();
        let mut names = stmt.walk();
        for child in stmt.children(&mut names) {
            // Anonymous tokens (`from`, `import`, `,`, parentheses) and
            // the module-name node itself are not imported names.
            if !child.is_named() || child.id() == module_id {
                continue;
            }
            let local = match child.kind() {
                "dotted_name" => node_text(child, source),
                // `alias` field = local name; that's what we resolve.
                "aliased_import" => child
                    .child_by_field_name("alias")
                    .and_then(|alias| node_text(alias, source)),
                _ => None,
            };
            if let Some(local) = local {
                map.insert(local.to_string(), module.to_string());
            }
        }
    }
    map
}

/// Build a `local_alias → canonical_module_name` map from the
/// top-level `import_statement` nodes under `root`.
///
/// Every shape tree-sitter-python emits for `import …` is covered:
///
/// | Source                          | Map                                    |
/// |---------------------------------|----------------------------------------|
/// | `import hashlib`                | `{"hashlib": "hashlib"}`               |
/// | `import hashlib as hl`          | `{"hl": "hashlib"}`                    |
/// | `import os, sys`                | `{"os": "os", "sys": "sys"}`           |
/// | `import os as o, sys as s`      | `{"o": "os", "s": "sys"}`              |
/// | `import os.path`                | `{"os.path": "os.path"}`               |
/// | `import os.path as op`          | `{"op": "os.path"}`                    |
///
/// Unaliased imports deliberately produce identity entries
/// (`{"hashlib": "hashlib"}`) so a caller can resolve any receiver with
/// one `module_aliases.get(label)` call, falling back to the literal
/// label only when nothing is found.
///
/// Each named child of an `import_statement` is either a `dotted_name`
/// (unaliased) or an `aliased_import` carrying `name` and `alias`
/// fields; any other child is ignored, as is an `aliased_import` where
/// either field is missing or not valid UTF-8. Only direct children of
/// `root` are visited, and a later binding of the same local name
/// replaces an earlier one.
pub(super) fn collect_python_module_aliases(
    root: Node<'_>,
    source: &[u8],
) -> HashMap<String, String> {
    let mut aliases = HashMap::new();
    let mut stmt_cursor = root.walk();
    for stmt in root.children(&mut stmt_cursor) {
        if stmt.kind() != "import_statement" {
            continue;
        }
        let mut item_cursor = stmt.walk();
        for item in stmt.children(&mut item_cursor).filter(|n| n.is_named()) {
            // Resolve each imported item to a `(local, module)` pair so
            // there is a single insertion site below.
            let binding = match item.kind() {
                // `import hashlib` binds the module under its own name.
                "dotted_name" => node_text(item, source).map(|name| (name, name)),
                "aliased_import" => {
                    let module = item
                        .child_by_field_name("name")
                        .and_then(|n| node_text(n, source));
                    let alias = item
                        .child_by_field_name("alias")
                        .and_then(|n| node_text(n, source));
                    alias.zip(module)
                }
                _ => None,
            };
            if let Some((local, module)) = binding {
                aliases.insert(local.to_string(), module.to_string());
            }
        }
    }
    aliases
}

/// Return the text that `node` spans in `source`, borrowed from `source`.
///
/// Returns `None` when the spanned bytes are not valid UTF-8, or when
/// the node starts beyond the end of `source` (for instance if the
/// buffer passed in is not the one the tree was parsed from).
fn node_text<'a>(node: Node<'_>, source: &'a [u8]) -> Option<&'a str> {
    let start = node.start_byte();
    // Clamp the end so a node that overruns the buffer still yields the
    // part of its text that is actually present.
    let end = node.end_byte().min(source.len());
    // Checked slicing: after clamping, `start` may exceed `end`, and
    // `source[start..end]` would panic in that case.
    source
        .get(start..end)
        .and_then(|bytes| std::str::from_utf8(bytes).ok())
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use tree_sitter::Parser;

    /// Parse `src` with the tree-sitter Python grammar.
    fn parse_python(src: &str) -> tree_sitter::Tree {
        let language: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
        let mut parser = Parser::new();
        parser.set_language(&language).expect("load python grammar");
        parser.parse(src, None).expect("parse")
    }

    /// Parse `src` and run the from-import walker over its root.
    fn from_imports(src: &str) -> HashMap<String, String> {
        let tree = parse_python(src);
        let map = collect_python_from_imports(tree.root_node(), src.as_bytes());
        map
    }

    /// Parse `src` and run the module-alias walker over its root.
    fn module_aliases(src: &str) -> HashMap<String, String> {
        let tree = parse_python(src);
        let map = collect_python_module_aliases(tree.root_node(), src.as_bytes());
        map
    }

    /// Look `key` up in `map`, exposing the value as `&str` so the
    /// assertions can compare against string literals directly.
    fn resolved<'m>(map: &'m HashMap<String, String>, key: &str) -> Option<&'m str> {
        map.get(key).map(String::as_str)
    }

    // ----- collect_python_from_imports -----

    #[test]
    fn from_imports_simple() {
        let map = from_imports("from hashlib import md5\n");
        assert_eq!(resolved(&map, "md5"), Some("hashlib"));
    }

    #[test]
    fn from_imports_with_alias() {
        let map = from_imports("from hashlib import md5 as m\n");
        assert_eq!(resolved(&map, "m"), Some("hashlib"));
        // Only the alias is bound, so the original name must be absent.
        assert!(!map.contains_key("md5"));
    }

    #[test]
    fn from_imports_multi_with_alias() {
        let map = from_imports("from hashlib import md5, sha1 as sha_one\n");
        assert_eq!(resolved(&map, "md5"), Some("hashlib"));
        assert_eq!(resolved(&map, "sha_one"), Some("hashlib"));
    }

    // ----- collect_python_module_aliases -----

    #[test]
    fn module_aliases_unaliased_identity() {
        let map = module_aliases("import hashlib\n");
        assert_eq!(resolved(&map, "hashlib"), Some("hashlib"));
    }

    #[test]
    fn module_aliases_aliased() {
        let map = module_aliases("import hashlib as hl\n");
        assert_eq!(resolved(&map, "hl"), Some("hashlib"));
        // No plain `import hashlib`, so the canonical name is not a key.
        assert!(!map.contains_key("hashlib"));
    }

    #[test]
    fn module_aliases_multi_in_one_statement() {
        let map = module_aliases("import os, sys\n");
        assert_eq!(resolved(&map, "os"), Some("os"));
        assert_eq!(resolved(&map, "sys"), Some("sys"));
    }

    #[test]
    fn module_aliases_multi_aliased() {
        let map = module_aliases("import os as o, sys as s\n");
        assert_eq!(resolved(&map, "o"), Some("os"));
        assert_eq!(resolved(&map, "s"), Some("sys"));
    }

    #[test]
    fn module_aliases_dotted() {
        let map = module_aliases("import os.path\n");
        assert_eq!(resolved(&map, "os.path"), Some("os.path"));
    }

    #[test]
    fn module_aliases_dotted_aliased() {
        let map = module_aliases("import os.path as op\n");
        assert_eq!(resolved(&map, "op"), Some("os.path"));
    }

    #[test]
    fn module_aliases_ignores_function_local_imports() {
        let map = module_aliases("def f():\n    import hashlib as hl\n    return hl\n");
        // Imports inside a function body must not leak into the
        // top-level map.
        assert!(map.is_empty(), "got: {:?}", map);
    }

    #[test]
    fn module_aliases_ignores_from_imports() {
        let map = module_aliases("from os import system\n");
        assert!(map.is_empty());
    }

    #[test]
    fn from_imports_ignores_plain_import() {
        let map = from_imports("import hashlib as hl\n");
        assert!(map.is_empty());
    }

    #[test]
    fn module_aliases_alias_shadows_real_module() {
        // `safelib as os` — `os` now resolves to `safelib`, not the
        // stdlib `os`. Detectors should NOT flag `os.system(...)` here.
        let map = module_aliases("import safelib as os\n");
        assert_eq!(resolved(&map, "os"), Some("safelib"));
    }
}