Skip to main content

dbg_cli/session_db/canonicalizer/
python.rs

1//! Python (CPython / PyPy) symbol canonicalization.
2//!
3//! Raw forms seen in the wild:
4//!   * cProfile pstats: `/path/to/app.py:42(my_func)`
5//!     → fqn=`app.my_func`, file=`/path/to/app.py`, line=42
6//!   * py-spy / austin:   `my_func (app.py)`   → fqn=`app.my_func`, file=`app.py`
7//!   * dotted already:    `app.submodule.my_func` → fqn as-is
8//!   * Built-ins:         `<built-in method builtins.print>` → fqn=`builtins.print`
9//!
//! Lambdas, comprehensions and generator expressions — `<lambda>`,
//! `<listcomp>`, `<dictcomp>`, `<setcomp>`, `<genexpr>` — as well as the
//! module-level `<module>` frame get `is_synthetic = true` because their
//! line-number-derived identity is not stable across edits.
13
14use std::path::Path;
15use std::sync::OnceLock;
16
17use regex::Regex;
18
19use super::{CanonicalSymbol, Canonicalizer};
20
21pub struct PythonCanonicalizer;
22
23impl Canonicalizer for PythonCanonicalizer {
24    fn lang(&self) -> &'static str {
25        "python"
26    }
27
28    fn canonicalize(&self, raw: &str) -> CanonicalSymbol {
29        let parsed = parse(raw);
30        CanonicalSymbol {
31            lang: "python",
32            fqn: parsed.fqn,
33            file: parsed.file,
34            line: parsed.line,
35            demangled: None,
36            raw: raw.to_string(),
37            is_synthetic: parsed.synthetic,
38        }
39    }
40}
41
/// Intermediate result of [`parse`], before it is wrapped into a
/// `CanonicalSymbol`.
struct Parsed {
    /// Fully-qualified name derived from the raw symbol.
    fqn: String,
    /// Source file exactly as it appeared in the raw symbol, when one was present.
    file: Option<String>,
    /// Line number taken from the raw symbol, when one was present.
    line: Option<u32>,
    /// `true` for compiler-generated frames such as `<lambda>` or `<listcomp>`.
    synthetic: bool,
}
48
49fn parse(raw: &str) -> Parsed {
50    // `<built-in method builtins.print>`  → `builtins.print`
51    if let Some(inner) = raw
52        .strip_prefix("<built-in method ")
53        .and_then(|s| s.strip_suffix('>'))
54    {
55        return Parsed {
56            fqn: inner.to_string(),
57            file: None,
58            line: None,
59            synthetic: false,
60        };
61    }
62    // `<method 'write' of 'BufferedWriter' objects>` → `BufferedWriter.write`
63    static METHOD_OF: OnceLock<Regex> = OnceLock::new();
64    let re_method_of = METHOD_OF
65        .get_or_init(|| Regex::new(r"^<method '(?P<m>[^']+)' of '(?P<t>[^']+)' objects>$").unwrap());
66    if let Some(c) = re_method_of.captures(raw) {
67        return Parsed {
68            fqn: format!("{}.{}", &c["t"], &c["m"]),
69            file: None,
70            line: None,
71            synthetic: false,
72        };
73    }
74
75    // pstats form: `<file>:<line>(<func>)`
76    static PSTATS: OnceLock<Regex> = OnceLock::new();
77    let re_pstats = PSTATS.get_or_init(|| {
78        Regex::new(r"^(?P<file>[^\s\(]+):(?P<line>\d+)\((?P<func>[^)]+)\)$").unwrap()
79    });
80    if let Some(c) = re_pstats.captures(raw) {
81        let file = c["file"].to_string();
82        let line: u32 = c["line"].parse().ok().unwrap_or(0);
83        let func = c["func"].to_string();
84        let module = module_from_file(&file);
85        let fqn = if module.is_empty() {
86            func.clone()
87        } else {
88            format!("{module}.{func}")
89        };
90        let synthetic = is_synthetic_func(&func);
91        return Parsed { fqn, file: Some(file), line: Some(line), synthetic };
92    }
93
94    // py-spy form: `my_func (app.py)` / `my_func (app.py:42)`
95    static PYSPY: OnceLock<Regex> = OnceLock::new();
96    let re_pyspy = PYSPY.get_or_init(|| {
97        Regex::new(r"^(?P<func>[A-Za-z_<][\w<>]*)\s+\((?P<file>[^:)]+)(?::(?P<line>\d+))?\)$").unwrap()
98    });
99    if let Some(c) = re_pyspy.captures(raw) {
100        let func = c["func"].to_string();
101        let file = c["file"].to_string();
102        let line: Option<u32> = c.name("line").and_then(|m| m.as_str().parse().ok());
103        let module = module_from_file(&file);
104        let fqn = if module.is_empty() {
105            func.clone()
106        } else {
107            format!("{module}.{func}")
108        };
109        let synthetic = is_synthetic_func(&func);
110        return Parsed { fqn, file: Some(file), line, synthetic };
111    }
112
113    // Bare dotted or bare function name.
114    let synthetic = is_synthetic_func(raw);
115    Parsed {
116        fqn: raw.to_string(),
117        file: None,
118        line: None,
119        synthetic,
120    }
121}
122
/// Derives a module name from a source-file path.
///
/// Directories and the final extension are dropped, so `/opt/app/api.py`
/// becomes `api`. Dots that are part of the file name itself are kept
/// (`a.b.py` → `a.b`). Returns an empty string when no file name can be
/// extracted (for example for an empty path).
///
/// Backslash-separated directories are stripped explicitly: `Path` only
/// treats `\` as a separator on Windows hosts, so a profile recorded on
/// Windows (`C:\proj\app.py`) would otherwise yield `C:\proj\app` as the
/// module when analysed on Linux or macOS.
fn module_from_file(file: &str) -> String {
    // Ignore trailing backslashes first, mirroring how `Path` ignores a
    // trailing `/`, then keep only what follows the last backslash.
    let trimmed = file.trim_end_matches('\\');
    let leaf = trimmed
        .rsplit_once('\\')
        .map_or(trimmed, |(_, tail)| tail);

    Path::new(leaf)
        .file_stem()
        .and_then(|stem| stem.to_str())
        .unwrap_or("")
        .to_string()
}
132
/// Reports whether `f` names a compiler-generated Python frame.
///
/// Only the last dotted segment is inspected, so both the bare name
/// (`<lambda>`) and a qualified one (`app.<lambda>`,
/// `outer.<locals>.<genexpr>`) are recognised. The synthetic names are
/// `<lambda>`, `<listcomp>`, `<dictcomp>`, `<setcomp>`, `<genexpr>` and
/// `<module>`.
fn is_synthetic_func(f: &str) -> bool {
    // Without a dot the whole string is the leaf name.
    let leaf = f.rsplit_once('.').map_or(f, |(_, tail)| tail);
    matches!(
        leaf,
        "<lambda>" | "<listcomp>" | "<dictcomp>" | "<setcomp>" | "<genexpr>" | "<module>"
    )
}
136
#[cfg(test)]
mod tests {
    use super::*;

    // --- cProfile / pstats form -------------------------------------------

    #[test]
    fn pstats_form_parsed() {
        let sym = PythonCanonicalizer.canonicalize("/opt/myapp/api.py:42(handle_request)");
        assert_eq!(sym.fqn, "api.handle_request");
        assert_eq!(sym.file.as_deref(), Some("/opt/myapp/api.py"));
        assert_eq!(sym.line, Some(42));
        assert!(!sym.is_synthetic);
    }

    #[test]
    fn pstats_lambda_is_synthetic() {
        let sym = PythonCanonicalizer.canonicalize("/opt/myapp/api.py:42(<lambda>)");
        assert!(sym.is_synthetic);
        assert_eq!(sym.fqn, "api.<lambda>");
    }

    // --- py-spy form --------------------------------------------------------

    #[test]
    fn pyspy_form_with_line_parsed() {
        let sym = PythonCanonicalizer.canonicalize("handle_request (api.py:42)");
        assert_eq!(sym.fqn, "api.handle_request");
        assert_eq!(sym.file.as_deref(), Some("api.py"));
        assert_eq!(sym.line, Some(42));
    }

    #[test]
    fn pyspy_form_without_line_parsed() {
        let sym = PythonCanonicalizer.canonicalize("handle_request (api.py)");
        assert_eq!(sym.fqn, "api.handle_request");
        assert_eq!(sym.line, None);
    }

    // --- pass-through and bracketed built-in forms --------------------------

    #[test]
    fn bare_dotted_is_passed_through() {
        let sym = PythonCanonicalizer.canonicalize("myapp.services.users.login");
        assert_eq!(sym.fqn, "myapp.services.users.login");
        assert_eq!(sym.file, None);
    }

    #[test]
    fn builtin_method_form() {
        let sym = PythonCanonicalizer.canonicalize("<built-in method builtins.print>");
        assert_eq!(sym.fqn, "builtins.print");
    }

    #[test]
    fn method_of_form() {
        let sym = PythonCanonicalizer.canonicalize("<method 'write' of 'BufferedWriter' objects>");
        assert_eq!(sym.fqn, "BufferedWriter.write");
    }

    // --- synthetic frames ---------------------------------------------------

    #[test]
    fn listcomp_synthetic() {
        let sym = PythonCanonicalizer.canonicalize("/app/main.py:10(<listcomp>)");
        assert!(sym.is_synthetic);
    }

    #[test]
    fn module_level_synthetic() {
        let sym = PythonCanonicalizer.canonicalize("/app/main.py:1(<module>)");
        assert!(sym.is_synthetic);
    }

    // --- behaviour reached through the `Canonicalizer` trait ----------------

    #[test]
    fn structured_default_joins_with_dot() {
        let sym = PythonCanonicalizer.canonicalize_structured("", "UserService", "login", "");
        assert_eq!(sym.fqn, "UserService.login");
    }

    #[test]
    fn key_is_lang_plus_fqn() {
        let sym = PythonCanonicalizer.canonicalize("app.main");
        assert_eq!(sym.key(), ("python", "app.main"));
    }
}
216}