repotoire 0.8.0

Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
//! Python import alias resolution for AST-first detectors.
//!
//! Walks top-level import nodes and builds maps that detectors use to
//! recognize aliased call shapes without any whole-program analysis.
//!
//! Two complementary walkers are exposed:
//!
//! - [`collect_python_from_imports`] handles `from M import X [as Y]`.
//!   Originally implemented inline in `insecure_crypto.rs` (commit
//!   `32021903`); extracted here once the same pattern was needed by
//!   `eval_detector`, `command_injection`, and `pickle_detector`
//!   (commit `a6d7ed2d`).
//! - [`collect_python_module_aliases`] handles `import M [as N]`. The
//!   from-import walker was insufficient for `import hashlib as hl;
//!   hl.md5(data)` — `hl.md5` parses as an attribute call whose object
//!   text is `"hl"`, not `"hashlib"`. This walker lets each detector's
//!   attribute-call branch resolve `hl` back to `hashlib`.
//!
//! Both maps are independent and never interfere: the from-import map
//! is keyed on names that appear as bare-identifier callees, the
//! module-aliases map is keyed on names that appear as the OBJECT of an
//! attribute callee. Each detector consults the relevant map for the
//! call shape it's seeing.
//!
//! ## Edge cases (shared by both walkers)
//!
//! - `from M import *` — wildcard, unresolvable; skipped.
//! - `from . import X` / `from ..pkg import X` — relative import with
//!   no usable module name; skipped (we only emit entries when the
//!   `module_name` field is a non-empty dotted name).
//! - Function-local / conditional imports — only top-level
//!   `import_statement` / `import_from_statement` children of the
//!   module root are walked.
//! - Name imported from multiple modules — last write wins.
//! - `from M import X as Y` — `Y` (the local alias) maps to `M`.
//! - `import M as N` — `N` (the local alias) maps to `M`.
//! - `import os.path` — dotted module; emits `{"os.path": "os.path"}`.
//!   Detector matchers typically check the first segment only, so this
//!   is mostly inert for security detection but kept for correctness.

use std::collections::{HashMap, HashSet};

use tree_sitter::Node;

/// Bundle of the two Python import alias maps used by AST matchers
/// across security detectors.
///
/// `imports` maps `local_name -> module` (built by
/// [`collect_python_from_imports`]); note that only the module is stored,
/// not `module.symbol` — consumers append the callee name themselves when
/// they need a qualified name. `modules` maps `local_alias ->
/// module_name` (built by [`collect_python_module_aliases`]). Every
/// matcher that consumes one consumes both, so they travel as a unit.
///
/// The bundle borrows the underlying maps; construct via
/// [`PythonAliases::new`] at the highest scope where both maps exist
/// (typically right after building them in `scan_file_ast`) and pass
/// `&aliases` down through the recursive `collect_*_sites` walkers.
pub(super) struct PythonAliases<'a> {
    /// From-import map: local (possibly aliased) name -> source module.
    pub imports: &'a HashMap<String, String>,
    /// Module-alias map: local module alias -> canonical module name.
    pub modules: &'a HashMap<String, String>,
}

impl<'a> PythonAliases<'a> {
    pub(super) fn new(
        imports: &'a HashMap<String, String>,
        modules: &'a HashMap<String, String>,
    ) -> Self {
        Self { imports, modules }
    }
}

/// Walk top-level `import_from_statement` nodes and build a map of
/// `local_name → module`.
///
/// E.g. `from hashlib import md5, sha1 as sha_one` produces:
///   - `"md5" → "hashlib"`
///   - `"sha_one" → "hashlib"` (the aliased local name maps to the module)
///
/// Only direct children of `root` are inspected, so function-local and
/// conditional imports are ignored. Skipped without emitting entries:
///   - wildcard imports (`from M import *`);
///   - relative imports (`from . import X`, `from ..pkg import X`): in
///     tree-sitter-python their `module_name` field is a
///     `relative_import` node rather than a `dotted_name`, and its text
///     (`.`, `..pkg`) is not a usable module name.
///
/// When the same local name is imported more than once, the last
/// statement wins.
pub(super) fn collect_python_from_imports(
    root: Node<'_>,
    source: &[u8],
) -> HashMap<String, String> {
    let mut map = HashMap::new();
    let mut cursor = root.walk();
    for top in root.children(&mut cursor) {
        if top.kind() != "import_from_statement" {
            continue;
        }
        // Accept only absolute module names. A relative import exposes a
        // `relative_import` node here whose text is non-empty, so an
        // emptiness check alone would let it through.
        let module_node = match top.child_by_field_name("module_name") {
            Some(n) if n.kind() == "dotted_name" => n,
            _ => continue,
        };
        let module = match node_text(module_node, source) {
            Some(text) if !text.is_empty() => text,
            _ => continue,
        };
        let mut nc = top.walk();
        for child in top.children(&mut nc) {
            // Skip punctuation/keywords and the module-name node itself
            // (it is also a `dotted_name` and must not be treated as an
            // imported symbol).
            if !child.is_named() || child.id() == module_node.id() {
                continue;
            }
            let local = match child.kind() {
                "dotted_name" => node_text(child, source),
                // `alias` field = local name; that's what we resolve.
                "aliased_import" => child
                    .child_by_field_name("alias")
                    .and_then(|n| node_text(n, source)),
                _ => None,
            };
            if let Some(local) = local {
                map.insert(local.to_string(), module.to_string());
            }
        }
    }
    map
}

/// Build the `local_alias → canonical_module_name` map from the
/// top-level `import_statement` nodes under `root`.
///
/// Every shape of plain `import …` is covered:
///
/// | Source                          | Map                                    |
/// |---------------------------------|----------------------------------------|
/// | `import hashlib`                | `{"hashlib": "hashlib"}`               |
/// | `import hashlib as hl`          | `{"hl": "hashlib"}`                    |
/// | `import os, sys`                | `{"os": "os", "sys": "sys"}`           |
/// | `import os as o, sys as s`      | `{"o": "os", "s": "sys"}`              |
/// | `import os.path`                | `{"os.path": "os.path"}`               |
/// | `import os.path as op`          | `{"op": "os.path"}`                    |
///
/// Unaliased imports deliberately produce identity entries so a caller
/// can resolve any receiver with one `get` and fall back to the literal
/// label when nothing is found.
///
/// Each named child of an `import_statement` is either a `dotted_name`
/// (unaliased) or an `aliased_import` carrying `name` and `alias`
/// fields; anything else is ignored. Only direct children of `root` are
/// considered, and a later binding of the same local name overwrites an
/// earlier one.
pub(super) fn collect_python_module_aliases(
    root: Node<'_>,
    source: &[u8],
) -> HashMap<String, String> {
    let mut aliases = HashMap::new();
    let mut top_cursor = root.walk();
    let import_statements = root
        .children(&mut top_cursor)
        .filter(|stmt| stmt.kind() == "import_statement");
    for stmt in import_statements {
        let mut item_cursor = stmt.walk();
        for item in stmt.children(&mut item_cursor).filter(|c| c.is_named()) {
            // Resolve each import item to a `(local, canonical)` pair.
            let binding: Option<(&str, &str)> = match item.kind() {
                // `import hashlib` binds the module under its own name.
                "dotted_name" => node_text(item, source).map(|name| (name, name)),
                "aliased_import" => {
                    let canonical = item
                        .child_by_field_name("name")
                        .and_then(|n| node_text(n, source));
                    let local = item
                        .child_by_field_name("alias")
                        .and_then(|n| node_text(n, source));
                    local.zip(canonical)
                }
                _ => None,
            };
            if let Some((local, canonical)) = binding {
                aliases.insert(local.to_owned(), canonical.to_owned());
            }
        }
    }
    aliases
}

/// Return the UTF-8 source text covered by `node`, borrowed from `source`.
///
/// The node's end offset is clamped to the buffer length. Returns `None`
/// when the (clamped) byte range does not lie inside `source` — e.g. the
/// node starts past the end of the buffer — or when the bytes are not
/// valid UTF-8. Never panics.
fn node_text<'a>(node: Node<'_>, source: &'a [u8]) -> Option<&'a str> {
    let start = node.start_byte();
    let end = node.end_byte().min(source.len());
    // `get` instead of indexing: after clamping `end`, `start` may exceed
    // it, and `source[start..end]` would panic on that inverted range.
    source
        .get(start..end)
        .and_then(|bytes| std::str::from_utf8(bytes).ok())
}

/// Scan a file for `self.<attr> = <Constructor>(...)` assignments inside
/// class bodies and report which attributes are initialized from a
/// constructor listed in `ctor_names` (after Python alias resolution).
///
/// The result maps `class_name -> set of attribute names`. Classes with
/// no matching assignment do not appear in the map.
///
/// Intended for detectors that want to suppress findings on
/// `self.<attr>.<method>(...)` when `<attr>` is known to hold a specific
/// type even though the enclosing class does not inherit from it
/// (composition, e.g. `httpx.Cookies` holding `self.jar = CookieJar()`
/// while inheriting from `MutableMapping`).
///
/// `ctor_names` should list BOTH the bare and the dotted spellings the
/// detector cares about, e.g. `["CookieJar", "RequestsCookieJar",
/// "http.cookiejar.CookieJar", "requests.cookies.RequestsCookieJar"]`.
///
/// Resolution rules:
/// - bare-name callee `Foo()` → `aliases.imports["Foo"]` (from-import
///   map) is consulted; if present, `<module>.Foo` is matched against
///   `ctor_names`, and the bare local name is matched either way.
/// - attribute callee `mod.Foo()` → `aliases.modules[mod_text]`
///   (module-alias map) is consulted; if present, `<module>.Foo` is
///   matched. Otherwise the literal call text and the bare `Foo` are
///   tried.
///
/// # Placement rationale
///
/// This helper sits in `python_imports.rs` next to the alias walkers
/// instead of `ast_helpers.rs` because it needs [`PythonAliases`].
/// Keeping it here preserves the `ast_helpers.rs` "depends only on
/// `tree_sitter`" invariant and avoids giving that module a
/// `super::python_imports` dependency.
pub(super) fn class_attr_constructors_of(
    root: Node<'_>,
    source: &[u8],
    aliases: &PythonAliases<'_>,
    ctor_names: &HashSet<&str>,
) -> HashMap<String, HashSet<String>> {
    let mut found: HashMap<String, HashSet<String>> = HashMap::new();
    // Explicit work list: visits every node of the tree without recursion.
    let mut pending = vec![root];
    while let Some(current) = pending.pop() {
        if current.kind() == "class_definition" {
            let class_name = current
                .child_by_field_name("name")
                .and_then(|n| node_text(n, source));
            let class_body = current.child_by_field_name("body");
            if let (Some(class_name), Some(class_body)) = (class_name, class_body) {
                collect_self_attr_ctors(
                    class_body,
                    source,
                    aliases,
                    ctor_names,
                    class_name,
                    &mut found,
                );
            }
        }
        // Keep descending regardless, so nested classes are found too.
        let mut walker = current.walk();
        pending.extend(current.children(&mut walker));
    }
    found
}

/// Walk the subtree rooted at `node` (a class body) and record every
/// `self.<attr> = <ctor>(...)` assignment whose constructor resolves
/// into `ctor_names`, adding `<attr>` to `out[class_name]`.
///
/// Nested `class_definition` nodes below `node` are NOT descended into:
/// their `self` belongs to a different class, so crediting their
/// attributes to `class_name` would be wrong. The caller visits nested
/// classes on its own and records them under their own name.
///
/// The walk uses an explicit stack rather than recursion so deeply
/// nested source cannot exhaust the call stack. Visit order is
/// irrelevant because results are accumulated into a set.
fn collect_self_attr_ctors(
    node: Node<'_>,
    source: &[u8],
    aliases: &PythonAliases<'_>,
    ctor_names: &HashSet<&str>,
    class_name: &str,
    out: &mut HashMap<String, HashSet<String>>,
) {
    let mut pending = vec![node];
    while let Some(current) = pending.pop() {
        if current.kind() == "assignment" {
            let attr_name = self_attribute_name(current.child_by_field_name("left"), source);
            let rhs = current.child_by_field_name("right");
            if let (Some(attr_name), Some(rhs)) = (attr_name, rhs) {
                if rhs.kind() == "call"
                    && call_resolves_to_known_ctor(rhs, source, aliases, ctor_names)
                {
                    out.entry(class_name.to_string())
                        .or_default()
                        .insert(attr_name);
                }
            }
        }
        let mut cursor = current.walk();
        // Stop at nested class boundaries (see the doc comment above).
        pending.extend(
            current
                .children(&mut cursor)
                .filter(|child| child.kind() != "class_definition"),
        );
    }
}

/// Extract `<name>` from an `attribute` node of the exact shape
/// `self.<name>`.
///
/// Returns `None` when `node` is absent, is not an `attribute`, has a
/// receiver whose text is anything other than `self`, or when any piece
/// of text cannot be read.
fn self_attribute_name(node: Option<Node<'_>>, source: &[u8]) -> Option<String> {
    let target = node.filter(|candidate| candidate.kind() == "attribute")?;
    let receiver = target
        .child_by_field_name("object")
        .and_then(|object| node_text(object, source))?;
    if receiver != "self" {
        return None;
    }
    target
        .child_by_field_name("attribute")
        .and_then(|member| node_text(member, source))
        .map(str::to_owned)
}

/// Report whether the callee of a `call` node names one of `ctor_names`
/// once import aliases have been taken into account.
///
/// - Bare identifier `Foo(...)`: matches if `<module>.Foo` is listed
///   (with `<module>` taken from `aliases.imports["Foo"]`), or if `Foo`
///   itself is listed.
/// - Attribute `obj.Foo(...)`: matches if `<module>.Foo` is listed (with
///   `<module>` taken from `aliases.modules[obj]`), or if the literal
///   callee text is listed, or if the bare `Foo` is listed.
/// - Any other callee shape never matches.
///
/// NOTE: the from-import map stores only the module for a local name, so
/// for `from m import Foo as F` the qualified candidate built here is
/// `m.F`, not `m.Foo`; such an aliased constructor only matches if the
/// alias itself appears in `ctor_names`.
fn call_resolves_to_known_ctor(
    call: Node<'_>,
    source: &[u8],
    aliases: &PythonAliases<'_>,
    ctor_names: &HashSet<&str>,
) -> bool {
    let callee = match call.child_by_field_name("function") {
        Some(node) => node,
        None => return false,
    };

    if callee.kind() == "identifier" {
        let local = match node_text(callee, source) {
            Some(text) => text,
            None => return false,
        };
        // Try the module-qualified spelling first, then the bare name.
        let qualified_hit = aliases.imports.get(local).map_or(false, |module| {
            ctor_names.contains(format!("{module}.{local}").as_str())
        });
        return qualified_hit || ctor_names.contains(local);
    }

    if callee.kind() != "attribute" {
        return false;
    }

    let member = callee
        .child_by_field_name("attribute")
        .and_then(|n| node_text(n, source));
    let receiver = callee.child_by_field_name("object");
    let (member, receiver) = match (member, receiver) {
        (Some(member), Some(receiver)) => (member, receiver),
        _ => return false,
    };

    // Unreadable receiver text degrades to "", which has no alias entry.
    let receiver_text = node_text(receiver, source).unwrap_or("");
    // `cl.CookieJar` → if `cl` is a module alias, compare the normalized name.
    let alias_hit = aliases.modules.get(receiver_text).map_or(false, |module| {
        ctor_names.contains(format!("{module}.{member}").as_str())
    });
    // Fallbacks: the callee exactly as written, then just the attribute.
    let literal_hit =
        node_text(callee, source).map_or(false, |full| ctor_names.contains(full));

    alias_hit || literal_hit || ctor_names.contains(member)
}

#[cfg(test)]
mod tests {
    use super::*;
    use tree_sitter::Parser;

    /// Parse `src` with the tree-sitter Python grammar.
    fn parse_python(src: &str) -> tree_sitter::Tree {
        let language: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
        let mut parser = Parser::new();
        parser.set_language(&language).expect("load python grammar");
        parser.parse(src, None).expect("parse")
    }

    /// Parse `src` and run the from-import walker over it.
    fn from_imports_of(src: &str) -> HashMap<String, String> {
        let tree = parse_python(src);
        collect_python_from_imports(tree.root_node(), src.as_bytes())
    }

    /// Parse `src` and run the module-alias walker over it.
    fn module_aliases_of(src: &str) -> HashMap<String, String> {
        let tree = parse_python(src);
        collect_python_module_aliases(tree.root_node(), src.as_bytes())
    }

    /// Look up `key` and expose the value as `&str` for terse assertions.
    fn resolved<'m>(map: &'m HashMap<String, String>, key: &str) -> Option<&'m str> {
        map.get(key).map(String::as_str)
    }

    // ----- collect_python_from_imports -----

    #[test]
    fn from_imports_simple() {
        let map = from_imports_of("from hashlib import md5\n");
        assert_eq!(resolved(&map, "md5"), Some("hashlib"));
    }

    #[test]
    fn from_imports_with_alias() {
        let map = from_imports_of("from hashlib import md5 as m\n");
        assert_eq!(resolved(&map, "m"), Some("hashlib"));
        // Original name should NOT be in the map when only the alias was bound.
        assert!(!map.contains_key("md5"));
    }

    #[test]
    fn from_imports_multi_with_alias() {
        let map = from_imports_of("from hashlib import md5, sha1 as sha_one\n");
        assert_eq!(resolved(&map, "md5"), Some("hashlib"));
        assert_eq!(resolved(&map, "sha_one"), Some("hashlib"));
    }

    // ----- collect_python_module_aliases -----

    #[test]
    fn module_aliases_unaliased_identity() {
        let map = module_aliases_of("import hashlib\n");
        assert_eq!(resolved(&map, "hashlib"), Some("hashlib"));
    }

    #[test]
    fn module_aliases_aliased() {
        let map = module_aliases_of("import hashlib as hl\n");
        assert_eq!(resolved(&map, "hl"), Some("hashlib"));
        // Without `import hashlib`, the canonical name is NOT present.
        assert!(!map.contains_key("hashlib"));
    }

    #[test]
    fn module_aliases_multi_in_one_statement() {
        let map = module_aliases_of("import os, sys\n");
        assert_eq!(resolved(&map, "os"), Some("os"));
        assert_eq!(resolved(&map, "sys"), Some("sys"));
    }

    #[test]
    fn module_aliases_multi_aliased() {
        let map = module_aliases_of("import os as o, sys as s\n");
        assert_eq!(resolved(&map, "o"), Some("os"));
        assert_eq!(resolved(&map, "s"), Some("sys"));
    }

    #[test]
    fn module_aliases_dotted() {
        let map = module_aliases_of("import os.path\n");
        assert_eq!(resolved(&map, "os.path"), Some("os.path"));
    }

    #[test]
    fn module_aliases_dotted_aliased() {
        let map = module_aliases_of("import os.path as op\n");
        assert_eq!(resolved(&map, "op"), Some("os.path"));
    }

    #[test]
    fn module_aliases_ignores_function_local_imports() {
        let map = module_aliases_of("def f():\n    import hashlib as hl\n    return hl\n");
        // Function-local imports must not pollute the top-level map.
        assert!(map.is_empty(), "got: {:?}", map);
    }

    #[test]
    fn module_aliases_ignores_from_imports() {
        let map = module_aliases_of("from os import system\n");
        assert!(map.is_empty());
    }

    #[test]
    fn from_imports_ignores_plain_import() {
        let map = from_imports_of("import hashlib as hl\n");
        assert!(map.is_empty());
    }

    #[test]
    fn module_aliases_alias_shadows_real_module() {
        // `safelib as os` — `os` now resolves to `safelib`, not the
        // stdlib `os`. Detectors should NOT flag `os.system(...)` here.
        let map = module_aliases_of("import safelib as os\n");
        assert_eq!(resolved(&map, "os"), Some("safelib"));
    }

    // ----- class_attr_constructors_of -----

    fn cookie_ctor_names() -> HashSet<&'static str> {
        [
            "CookieJar",
            "RequestsCookieJar",
            "http.cookiejar.CookieJar",
            "requests.cookies.RequestsCookieJar",
        ]
        .into_iter()
        .collect()
    }

    /// Run the full pipeline (both alias walkers, then the class-attribute
    /// scan) against `src` using the cookie-jar constructor list.
    fn cookie_attrs_of(src: &str) -> HashMap<String, HashSet<String>> {
        let tree = parse_python(src);
        let root = tree.root_node();
        let bytes = src.as_bytes();
        let imports = collect_python_from_imports(root, bytes);
        let modules = collect_python_module_aliases(root, bytes);
        let aliases = PythonAliases::new(&imports, &modules);
        let ctors = cookie_ctor_names();
        class_attr_constructors_of(root, bytes, &aliases, &ctors)
    }

    #[test]
    fn class_attr_ctors_bare_name_from_import() {
        // Test 1 of the design-doc spec: from-import + bare-name ctor.
        let map = cookie_attrs_of(
            "from http.cookiejar import CookieJar\nclass C:\n    def __init__(self):\n        self.jar = CookieJar()\n",
        );
        let attrs = map.get("C").expect("C should be present");
        assert!(attrs.contains("jar"), "expected `jar` in {attrs:?}");
    }

    #[test]
    fn class_attr_ctors_module_alias() {
        // Test 2 of the design-doc spec: `import http.cookiejar as cl;
        // self.jar = cl.CookieJar()`.
        let map = cookie_attrs_of(
            "import http.cookiejar as cl\nclass C:\n    def __init__(self):\n        self.jar = cl.CookieJar()\n",
        );
        let attrs = map.get("C").expect("C should be present");
        assert!(attrs.contains("jar"), "expected `jar` in {attrs:?}");
    }

    #[test]
    fn class_attr_ctors_only_some_match() {
        // Test 3: `self.jar = CookieJar()` matches; `self.timeout = 30`
        // does not (not a known ctor call).
        let map = cookie_attrs_of(
            "from http.cookiejar import CookieJar\nclass C:\n    def __init__(self):\n        self.jar = CookieJar()\n        self.timeout = 30\n",
        );
        let attrs = map.get("C").expect("C should be present");
        assert!(attrs.contains("jar"));
        assert!(!attrs.contains("timeout"));
        assert_eq!(attrs.len(), 1);
    }

    #[test]
    fn class_attr_ctors_unknown_ctor_empty_map() {
        // Test 4: `self.x = SomethingElse()` — ctor not in our list.
        let map = cookie_attrs_of(
            "from collections import OrderedDict\nclass C:\n    def __init__(self):\n        self.data = OrderedDict()\n",
        );
        assert!(map.is_empty(), "expected empty map, got: {map:?}");
    }
}