Skip to main content

normalize_languages/
python.rs

1//! Python language support.
2
3use crate::{ContainerBody, Import, Language, LanguageSymbols, Visibility};
4use tree_sitter::Node;
5
6// ============================================================================
7// Python language support
8// ============================================================================
9
/// Language support for Python source files.
///
/// A field-less marker type: all behavior comes from the `Language` and
/// `LanguageSymbols` implementations in this file.
pub struct Python;
12
13impl Language for Python {
14    fn name(&self) -> &'static str {
15        "Python"
16    }
17    fn extensions(&self) -> &'static [&'static str] {
18        &["py", "pyi", "pyw"]
19    }
20    fn grammar_name(&self) -> &'static str {
21        "python"
22    }
23
24    fn as_symbols(&self) -> Option<&dyn LanguageSymbols> {
25        Some(self)
26    }
27
28    fn extract_docstring(&self, node: &Node, content: &str) -> Option<String> {
29        extract_docstring(node, content)
30    }
31
32    fn extract_implements(&self, node: &Node, content: &str) -> crate::ImplementsInfo {
33        let mut implements = Vec::new();
34        if let Some(superclasses) = node.child_by_field_name("superclasses") {
35            let mut cursor = superclasses.walk();
36            for child in superclasses.children(&mut cursor) {
37                if child.kind() == "identifier" {
38                    implements.push(content[child.byte_range()].to_string());
39                }
40            }
41        }
42        crate::ImplementsInfo {
43            is_interface: false,
44            implements,
45        }
46    }
47
48    fn build_signature(&self, node: &Node, content: &str) -> String {
49        let name = match self.node_name(node, content) {
50            Some(n) => n,
51            None => {
52                return content[node.byte_range()]
53                    .lines()
54                    .next()
55                    .unwrap_or("")
56                    .trim()
57                    .to_string();
58            }
59        };
60
61        if node.kind() == "class_definition" {
62            let bases = node
63                .child_by_field_name("superclasses")
64                .map(|b| &content[b.byte_range()])
65                .unwrap_or("");
66            if bases.is_empty() {
67                format!("class {}", name)
68            } else {
69                format!("class {}{}", name, bases)
70            }
71        } else {
72            // function_definition / decorated_definition
73            let is_async = node
74                .child(0)
75                .map(|c| &content[c.byte_range()] == "async")
76                .unwrap_or(false);
77            let prefix = if is_async { "async def" } else { "def" };
78            let params = node
79                .child_by_field_name("parameters")
80                .map(|p| &content[p.byte_range()])
81                .unwrap_or("()");
82            let return_type = node
83                .child_by_field_name("return_type")
84                .map(|r| format!(" -> {}", &content[r.byte_range()]))
85                .unwrap_or_default();
86            format!("{} {}{}{}", prefix, name, params, return_type)
87        }
88    }
89
90    fn extract_imports(&self, node: &Node, content: &str) -> Vec<Import> {
91        let line = node.start_position().row + 1;
92
93        match node.kind() {
94            "import_statement" => {
95                // import foo, import foo as bar
96                let mut imports = Vec::new();
97                let mut cursor = node.walk();
98                for child in node.children(&mut cursor) {
99                    if child.kind() == "dotted_name" {
100                        let module = content[child.byte_range()].to_string();
101                        imports.push(Import {
102                            module,
103                            names: Vec::new(),
104                            alias: None,
105                            is_wildcard: false,
106                            is_relative: false,
107                            line,
108                        });
109                    } else if child.kind() == "aliased_import"
110                        && let Some(name) = child.child_by_field_name("name")
111                    {
112                        let module = content[name.byte_range()].to_string();
113                        let alias = child
114                            .child_by_field_name("alias")
115                            .map(|a| content[a.byte_range()].to_string());
116                        imports.push(Import {
117                            module,
118                            names: Vec::new(),
119                            alias,
120                            is_wildcard: false,
121                            is_relative: false,
122                            line,
123                        });
124                    }
125                }
126                imports
127            }
128            "import_from_statement" => {
129                // from foo import bar, baz
130                let module = node
131                    .child_by_field_name("module_name")
132                    .map(|m| content[m.byte_range()].to_string())
133                    .unwrap_or_default();
134
135                // Check for relative import (from . or from .. or from .foo)
136                let text = &content[node.byte_range()];
137                let is_relative = text.starts_with("from .");
138
139                let mut names = Vec::new();
140                let mut is_wildcard = false;
141                let module_end = node
142                    .child_by_field_name("module_name")
143                    .map(|m| m.end_byte())
144                    .unwrap_or(0);
145
146                let mut cursor = node.walk();
147                for child in node.children(&mut cursor) {
148                    match child.kind() {
149                        "dotted_name" | "identifier" if child.start_byte() > module_end => {
150                            names.push(content[child.byte_range()].to_string());
151                        }
152                        "aliased_import" => {
153                            if let Some(name) = child.child_by_field_name("name") {
154                                names.push(content[name.byte_range()].to_string());
155                            }
156                        }
157                        "wildcard_import" => {
158                            is_wildcard = true;
159                        }
160                        _ => {}
161                    }
162                }
163
164                vec![Import {
165                    module,
166                    names,
167                    alias: None,
168                    is_wildcard,
169                    is_relative,
170                    line,
171                }]
172            }
173            _ => Vec::new(),
174        }
175    }
176
177    fn format_import(&self, import: &Import, names: Option<&[&str]>) -> String {
178        let names_to_use: Vec<&str> = names
179            .map(|n| n.to_vec())
180            .unwrap_or_else(|| import.names.iter().map(|s| s.as_str()).collect());
181
182        if import.is_wildcard {
183            format!("from {} import *", import.module)
184        } else if names_to_use.is_empty() {
185            if let Some(ref alias) = import.alias {
186                format!("import {} as {}", import.module, alias)
187            } else {
188                format!("import {}", import.module)
189            }
190        } else {
191            format!("from {} import {}", import.module, names_to_use.join(", "))
192        }
193    }
194
195    fn extract_attributes(&self, node: &Node, content: &str) -> Vec<String> {
196        extract_decorators(node, content)
197    }
198
199    fn get_visibility(&self, node: &Node, content: &str) -> Visibility {
200        if let Some(name) = self.node_name(node, content) {
201            if name.starts_with("__") && name.ends_with("__") {
202                Visibility::Public // dunder methods
203            } else if name.starts_with("__") {
204                Visibility::Private // name mangled
205            } else if name.starts_with('_') {
206                Visibility::Protected // convention private
207            } else {
208                Visibility::Public
209            }
210        } else {
211            Visibility::Public
212        }
213    }
214
215    fn is_test_symbol(&self, symbol: &crate::Symbol) -> bool {
216        let name = symbol.name.as_str();
217        match symbol.kind {
218            crate::SymbolKind::Function | crate::SymbolKind::Method => name.starts_with("test_"),
219            crate::SymbolKind::Class => name.starts_with("Test") && name.len() > 4,
220            crate::SymbolKind::Module => name == "tests" || name == "test" || name == "__tests__",
221            _ => false,
222        }
223    }
224
225    fn test_file_globs(&self) -> &'static [&'static str] {
226        &["**/test_*.py", "**/*_test.py"]
227    }
228
229    fn extract_module_doc(&self, src: &str) -> Option<String> {
230        extract_python_module_doc(src)
231    }
232
233    fn body_has_docstring(&self, body: &Node, content: &str) -> bool {
234        let _ = content;
235        body.child(0)
236            .map(|c| {
237                c.kind() == "string"
238                    || (c.kind() == "expression_statement"
239                        && c.child(0).map(|n| n.kind() == "string").unwrap_or(false))
240            })
241            .unwrap_or(false)
242    }
243
244    fn container_body<'a>(&self, node: &'a Node<'a>) -> Option<Node<'a>> {
245        node.child_by_field_name("body")
246    }
247
248    fn analyze_container_body(
249        &self,
250        body_node: &Node,
251        content: &str,
252        inner_indent: &str,
253    ) -> Option<ContainerBody> {
254        let mut cursor = body_node.walk();
255        let children: Vec<_> = body_node.children(&mut cursor).collect();
256
257        if children.is_empty() {
258            return Some(ContainerBody {
259                content_start: body_node.start_byte(),
260                content_end: body_node.end_byte(),
261                inner_indent: inner_indent.to_string(),
262                is_empty: true,
263            });
264        }
265
266        let mut first_real_idx = 0;
267        for (i, child) in children.iter().enumerate() {
268            let is_docstring = if child.kind() == "expression_statement" {
269                let mut child_cursor = child.walk();
270                child
271                    .children(&mut child_cursor)
272                    .next()
273                    .map(|fc| fc.kind() == "string")
274                    .unwrap_or(false)
275            } else {
276                child.kind() == "string"
277            };
278            if is_docstring && i == 0 {
279                first_real_idx = i + 1;
280                continue;
281            }
282            break;
283        }
284
285        let is_empty = children.iter().skip(first_real_idx).all(|c| {
286            c.kind() == "pass_statement"
287                || c.kind() == "string"
288                || (c.kind() == "expression_statement"
289                    && c.child(0).map(|fc| fc.kind() == "string").unwrap_or(false))
290        });
291
292        let content_start = if first_real_idx < children.len() {
293            let child_start = children[first_real_idx].start_byte();
294            content[..child_start]
295                .rfind('\n')
296                .map(|i| i + 1)
297                .unwrap_or(child_start)
298        } else if !children.is_empty() {
299            // normalize-syntax-allow: rust/unwrap-in-impl - !children.is_empty() guarantees last() is Some
300            let last_end = children.last().unwrap().end_byte();
301            if last_end < content.len() && content.as_bytes()[last_end] == b'\n' {
302                last_end + 1
303            } else {
304                last_end
305            }
306        } else {
307            body_node.start_byte()
308        };
309
310        Some(ContainerBody {
311            content_start,
312            content_end: body_node.end_byte(),
313            inner_indent: inner_indent.to_string(),
314            is_empty,
315        })
316    }
317}
318
// Empty impl: `Python` overrides nothing and relies on what the
// `LanguageSymbols` trait provides by default.
impl LanguageSymbols for Python {}
320
/// Extract the module-level docstring from Python source.
///
/// Skips an optional UTF-8 byte-order mark, blank lines and `#` comment lines
/// (shebang, PEP 263 coding declarations, license headers, ...), then expects
/// a triple-quoted string (`"""` or `'''`, optionally prefixed with `r`/`u`
/// in either case) as the first real content.
///
/// Returns the text between the quotes with surrounding whitespace trimmed
/// and `\r\n` line endings normalized to `\n`. Returns `None` when the file
/// does not start with such a string, the string is unterminated, or the
/// docstring is blank.
fn extract_python_module_doc(src: &str) -> Option<String> {
    // A byte-order mark is not whitespace, so it has to be removed explicitly.
    let mut rest = src.strip_prefix('\u{feff}').unwrap_or(src);

    // Comments are not statements, so a docstring may follow any number of them.
    loop {
        rest = rest.trim_start();
        if !rest.starts_with('#') {
            break;
        }
        rest = match rest.find('\n') {
            Some(newline) => &rest[newline + 1..],
            None => "",
        };
    }

    // Raw/unicode prefixes are allowed on docstrings (e.g. `r"""..."""`).
    let literal = rest
        .strip_prefix(|c: char| matches!(c, 'r' | 'R' | 'u' | 'U'))
        .unwrap_or(rest);

    // Must start with a triple-quoted string.
    let (quote, body) = if let Some(body) = literal.strip_prefix("\"\"\"") {
        ("\"\"\"", body)
    } else if let Some(body) = literal.strip_prefix("'''") {
        ("'''", body)
    } else {
        return None;
    };

    // Everything up to the matching closing delimiter is the docstring.
    let end = body.find(quote)?;
    let doc = body[..end].trim();
    if doc.is_empty() {
        None
    } else {
        Some(doc.replace("\r\n", "\n"))
    }
}
362
363/// Extract a Python docstring from a function or class body.
364///
365/// Looks for the first statement in the body being a string literal.
366/// Handles both old grammar style (expression_statement > string) and
367/// new arborium style (string directly, with string_content child).
368fn extract_docstring(node: &Node, content: &str) -> Option<String> {
369    let body = node.child_by_field_name("body")?;
370    let first = body.child(0)?;
371
372    // Handle both grammar versions:
373    // - Old: expression_statement > string
374    // - New (arborium): string directly, with string_content child
375    let string_node = match first.kind() {
376        "string" => Some(first),
377        "expression_statement" => first.child(0).filter(|n| n.kind() == "string"),
378        _ => None,
379    }?;
380
381    // Try string_content child (arborium style)
382    let mut cursor = string_node.walk();
383    for child in string_node.children(&mut cursor) {
384        if child.kind() == "string_content" {
385            let doc = content[child.byte_range()].trim();
386            if !doc.is_empty() {
387                return Some(doc.to_string());
388            }
389        }
390    }
391
392    // Fallback: extract from full string text (old style)
393    let text = &content[string_node.byte_range()];
394    let doc = text
395        .trim_start_matches("\"\"\"")
396        .trim_start_matches("'''")
397        .trim_start_matches('"')
398        .trim_start_matches('\'')
399        .trim_end_matches("\"\"\"")
400        .trim_end_matches("'''")
401        .trim_end_matches('"')
402        .trim_end_matches('\'')
403        .trim();
404
405    if !doc.is_empty() {
406        Some(doc.to_string())
407    } else {
408        None
409    }
410}
411
412/// Extract decorators from a Python definition node.
413/// Python wraps decorated definitions in a `decorated_definition` parent node.
414/// The node passed here is `function_definition` or `class_definition`,
415/// so we look at the parent for `decorator` siblings.
416fn extract_decorators(node: &Node, content: &str) -> Vec<String> {
417    let mut attrs = Vec::new();
418    if let Some(parent) = node.parent()
419        && parent.kind() == "decorated_definition"
420    {
421        let mut cursor = parent.walk();
422        for child in parent.children(&mut cursor) {
423            if child.kind() == "decorator" {
424                attrs.push(content[child.byte_range()].to_string());
425            }
426        }
427    }
428    attrs
429}
430
431#[cfg(test)]
432mod tests {
433    use super::*;
434    use crate::GrammarLoader;
435    use tree_sitter::Parser;
436
    /// A parsed syntax tree bundled with the loader that supplied its grammar.
    struct ParseResult {
        tree: tree_sitter::Tree,
        // Never read. Presumably held so the loaded grammar stays alive for as
        // long as `tree` is in use; TODO confirm against `GrammarLoader`.
        #[allow(dead_code)]
        loader: GrammarLoader,
    }
442
443    fn parse_python(content: &str) -> ParseResult {
444        let loader = GrammarLoader::new();
445        let language = loader.get("python").ok().unwrap();
446        let mut parser = Parser::new();
447        parser.set_language(&language).unwrap();
448        ParseResult {
449            tree: parser.parse(content, None).unwrap(),
450            loader,
451        }
452    }
453
454    #[test]
455    fn test_python_extract_function() {
456        let support = Python;
457        let content = r#"def foo(x: int) -> str:
458    """Convert to string."""
459    return str(x)
460"#;
461        let result = parse_python(content);
462        let root = result.tree.root_node();
463
464        // Find function node
465        let mut cursor = root.walk();
466        let func = root
467            .children(&mut cursor)
468            .find(|n| n.kind() == "function_definition")
469            .unwrap();
470
471        let sig = support.build_signature(&func, content);
472        let doc = support.extract_docstring(&func, content);
473        assert_eq!(support.node_name(&func, content), Some("foo"));
474        assert!(sig.contains("def foo(x: int) -> str"));
475        assert_eq!(doc, Some("Convert to string.".to_string()));
476    }
477
478    #[test]
479    fn test_python_extract_class() {
480        let support = Python;
481        let content = r#"class Foo(Bar):
482    """A foo class."""
483    pass
484"#;
485        let result = parse_python(content);
486        let root = result.tree.root_node();
487
488        let mut cursor = root.walk();
489        let class = root
490            .children(&mut cursor)
491            .find(|n| n.kind() == "class_definition")
492            .unwrap();
493
494        let sig = support.build_signature(&class, content);
495        let doc = support.extract_docstring(&class, content);
496        assert_eq!(support.node_name(&class, content), Some("Foo"));
497        assert!(sig.contains("class Foo(Bar)"));
498        assert_eq!(doc, Some("A foo class.".to_string()));
499    }
500
501    #[test]
502    fn test_python_visibility() {
503        let support = Python;
504        let content = r#"def public(): pass
505def _protected(): pass
506def __private(): pass
507def __dunder__(): pass
508"#;
509        let result = parse_python(content);
510        let root = result.tree.root_node();
511
512        let mut cursor = root.walk();
513        let funcs: Vec<_> = root
514            .children(&mut cursor)
515            .filter(|n| n.kind() == "function_definition")
516            .collect();
517
518        assert_eq!(
519            support.get_visibility(&funcs[0], content),
520            Visibility::Public
521        );
522        assert_eq!(
523            support.get_visibility(&funcs[1], content),
524            Visibility::Protected
525        );
526        assert_eq!(
527            support.get_visibility(&funcs[2], content),
528            Visibility::Private
529        );
530        assert_eq!(
531            support.get_visibility(&funcs[3], content),
532            Visibility::Public
533        ); // dunder
534    }
535
    /// Documents node kinds that exist in the Python grammar but aren't used in trait methods.
    /// Each exclusion has a reason. Review periodically as features expand.
    ///
    /// The list is passed to `validate_unused_kinds_audit`, and the test fails
    /// if that check returns an error.
    ///
    /// Run `cross_check_node_kinds` in registry.rs to see all potentially useful kinds.
    #[test]
    fn unused_node_kinds_audit() {
        use crate::validate_unused_kinds_audit;

        // Categories:
        // - STRUCTURAL: Internal/wrapper nodes, not semantically meaningful on their own
        // - CLAUSE: Sub-parts of statements, handled via parent (e.g., else_clause in if_statement)
        // - EXPRESSION: Expressions don't create control flow/scope, we track statements
        // - TYPE: Type annotation nodes, not relevant for current analysis
        // - LEGACY: Python 2 compatibility, not worth supporting
        // - OPERATOR: Operators within expressions, too granular
        // - MAYBE: Potentially useful, to be added when needed

        // Kept out of rustfmt so the trailing reason comments stay aligned.
        #[rustfmt::skip]
        let documented_unused: &[&str] = &[
            // STRUCTURAL
            "aliased_import",          // used internally by extract_imports
            "block",                   // generic block wrapper (duplicate in grammar)
            "expression_list",         // comma-separated expressions
            "import_prefix",           // dots in relative imports
            "lambda_parameters",       // internal to lambda
            "parenthesized_expression",// grouping only
            "relative_import",         // handled in extract_imports
            "tuple_expression",        // comma-separated values
            "wildcard_import",         // handled in extract_imports

            // CLAUSE (sub-parts of statements)
            "case_pattern",            // internal to case_clause
            "class_pattern",           // pattern in match/case
            "elif_clause",             // part of if_statement
            "else_clause",             // part of if/for/while/try
            "finally_clause",          // part of try_statement
            "for_in_clause",           // internal to comprehensions
            "if_clause",               // internal to comprehensions
            "with_clause",             // internal to with_statement
            "with_item",               // internal to with_statement

            // EXPRESSION (don't affect control flow structure)
            "await",                   // await keyword, not a statement
            "format_expression",       // f-string interpolation
            "format_specifier",        // f-string format spec
            "named_expression",        // walrus operator :=
            "yield",                   // yield keyword form

            // TYPE (type annotations)
            "constrained_type",        // type constraints
            "generic_type",            // parameterized types
            "member_type",             // attribute access in types
            "splat_type",              // *args/**kwargs types
            "type",                    // generic type node
            "type_alias_statement",    // could track as symbol
            "type_conversion",         // !r/!s/!a in f-strings
            "type_parameter",          // generic type params
            "typed_default_parameter", // param with type and default
            "typed_parameter",         // param with type annotation
            "union_type",              // X | Y union syntax

            // OPERATOR
            "binary_operator",         // +, -, *, /, etc.
            "boolean_operator",        // and/or - handled in complexity_nodes as keywords
            "comparison_operator",     // ==, <, >, etc.
            "not_operator",            // not keyword
            "unary_operator",          // -, +, ~

            // LEGACY (Python 2)
            "exec_statement",          // Python 2 exec
            "print_statement",         // Python 2 print

            // MAYBE: Potentially useful
            "decorated_definition",    // wrapper for @decorator
            "delete_statement",        // del statement
            "future_import_statement", // from __future__
            "global_statement",        // scope modifier
            "nonlocal_statement",      // scope modifier
            "pass_statement",          // no-op, detect empty bodies
            // control flow — not extracted as symbols
            "lambda",
            "import_statement",
            "continue_statement",
            "raise_statement",
            "case_clause",
            "generator_expression",
            "assert_statement",
            "if_statement",
            "while_statement",
            "with_statement",
            "try_statement",
            "import_from_statement",
            "return_statement",
            "except_clause",
            "dictionary_comprehension",
            "conditional_expression",
            "match_statement",
            "set_comprehension",
            "for_statement",
            "list_comprehension",
            "break_statement",
        ];

        validate_unused_kinds_audit(&Python, documented_unused)
            .expect("Python unused node kinds audit failed");
    }
642}