Skip to main content

qex_core/chunk/languages/
python.rs

1use super::{extract_preceding_comments, find_name, NodeMetadata, LanguageChunker};
2use crate::chunk::ChunkType;
3
4pub struct PythonChunker;
5
6impl LanguageChunker for PythonChunker {
7    fn tree_sitter_language(&self) -> tree_sitter::Language {
8        tree_sitter_python::LANGUAGE.into()
9    }
10
11    fn language_name(&self) -> &str {
12        "python"
13    }
14
15    fn file_extensions(&self) -> &[&str] {
16        &["py", "pyi"]
17    }
18
19    fn is_splittable(&self, node_type: &str) -> bool {
20        matches!(
21            node_type,
22            "function_definition" | "class_definition" | "decorated_definition"
23        )
24    }
25
26    fn has_nested_chunks(&self, node_type: &str) -> bool {
27        matches!(node_type, "class_definition" | "decorated_definition")
28    }
29
30    fn classify_node(&self, node_type: &str, parent_name: Option<&str>) -> ChunkType {
31        match node_type {
32            "class_definition" => ChunkType::Class,
33            "function_definition" if parent_name.is_some() => ChunkType::Method,
34            "function_definition" => ChunkType::Function,
35            "decorated_definition" => ChunkType::Function,
36            _ => ChunkType::ModuleLevel,
37        }
38    }
39
40    fn extract_metadata(&self, node: tree_sitter::Node, source: &str) -> NodeMetadata {
41        let mut meta = NodeMetadata::default();
42
43        match node.kind() {
44            "function_definition" => {
45                meta.name = find_name(node, source);
46                meta.is_async = {
47                    // Check if preceded by "async" keyword
48                    let text = &source[node.start_byte()..node.end_byte()];
49                    text.starts_with("async ")
50                };
51                // Extract docstring from body
52                meta.docstring = extract_python_docstring(node, source);
53            }
54            "class_definition" => {
55                meta.name = find_name(node, source);
56                meta.docstring = extract_python_docstring(node, source);
57            }
58            "decorated_definition" => {
59                // Extract decorators
60                let mut cursor = node.walk();
61                for child in node.children(&mut cursor) {
62                    if child.kind() == "decorator" {
63                        let text = &source[child.start_byte()..child.end_byte()];
64                        meta.decorators.push(text.to_string());
65                    } else if child.kind() == "function_definition" || child.kind() == "class_definition" {
66                        let inner = self.extract_metadata(child, source);
67                        meta.name = inner.name;
68                        meta.docstring = inner.docstring;
69                        meta.is_async = inner.is_async;
70                    }
71                }
72            }
73            _ => {
74                meta.name = find_name(node, source);
75                meta.docstring = extract_preceding_comments(node, source);
76            }
77        }
78
79        meta
80    }
81}
82
83fn extract_python_docstring(node: tree_sitter::Node, source: &str) -> Option<String> {
84    // Look for block/body child, then first expression_statement with string
85    let mut cursor = node.walk();
86    for child in node.children(&mut cursor) {
87        if child.kind() == "block" {
88            let mut block_cursor = child.walk();
89            for stmt in child.children(&mut block_cursor) {
90                if stmt.kind() == "expression_statement" {
91                    let mut stmt_cursor = stmt.walk();
92                    for expr in stmt.children(&mut stmt_cursor) {
93                        if expr.kind() == "string" {
94                            let text = &source[expr.start_byte()..expr.end_byte()];
95                            let cleaned = text
96                                .trim_start_matches("\"\"\"")
97                                .trim_end_matches("\"\"\"")
98                                .trim_start_matches("'''")
99                                .trim_end_matches("'''")
100                                .trim();
101                            return Some(cleaned.to_string());
102                        }
103                    }
104                }
105                // Only check the first statement
106                break;
107            }
108        }
109    }
110    None
111}