use anyhow::{Context, Result};
use tree_sitter::{Node, Parser, Query, QueryCursor};
use crate::syntax::parser::{CodeChunk, SyntaxParser};
/// Tree-sitter backed parser that splits Python source into [`CodeChunk`]s.
///
/// Both fields are built together in `PythonParser::new` from the same
/// Python grammar.
pub struct PythonParser {
/// Tree-sitter parser with the Python grammar loaded.
parser: Parser,
/// Compiled query selecting the node kinds that may become chunks.
query: Query,
}
impl PythonParser {
pub fn new() -> Self {
let mut parser = Parser::new();
let language = tree_sitter_python::language();
parser
.set_language(&language)
.expect("Error loading Python grammar");
let query = Query::new(
&language,
r#"
[
(function_definition) @func
(class_definition) @class
(decorated_definition) @decorator
(import_statement) @import
(import_from_statement) @import_from
(if_statement) @if
(for_statement) @for
(while_statement) @while
(try_statement) @try
(with_statement) @with
(expression_statement) @expr_stmt
]
"#,
)
.expect("Error creating Python query");
PythonParser { parser, query }
}
fn node_to_chunk(
&self,
node: Node,
code: &str,
file_path: &str,
capture_name: &str, ) -> Option<CodeChunk> {
let start_byte = node.start_byte();
let end_byte = node.end_byte();
let content = code.get(start_byte..end_byte)?.to_string();
let start_line = node.start_position().row + 1;
let end_line = node.end_position().row + 1;
let element_type = match capture_name {
"func" => "function",
"class" => "class",
"decorator" => {
let mut cursor = node.walk();
if node.children(&mut cursor).any(|n| n.kind() == "function_definition") {
"function"
} else if node.children(&mut cursor).any(|n| n.kind() == "class_definition") {
"class"
} else {
"decorated_definition" }
},
"import" | "import_from" | "if" | "for" | "while" | "try" | "with" | "expr_stmt" => "statement",
_ => "unknown", };
Some(CodeChunk {
content,
file_path: file_path.to_string(),
start_line,
end_line,
language: "python".to_string(),
element_type: element_type.to_string(),
})
}
}
impl SyntaxParser for PythonParser {
    /// Splits Python source into one [`CodeChunk`] per top-level construct.
    ///
    /// Only captured nodes whose direct parent is the `module` node are
    /// emitted; nested matches of the query are discarded. The module
    /// docstring and bare `pass` expression statements are skipped. Chunks are
    /// returned ordered by starting line. If nothing is emitted for input that
    /// is not blank, the whole file is returned as a single chunk with element
    /// type `"file"`.
    ///
    /// # Errors
    ///
    /// Returns an error if tree-sitter does not produce a syntax tree.
    fn parse(&mut self, code: &str, file_path: &str) -> Result<Vec<CodeChunk>> {
        let tree = self
            .parser
            .parse(code, None)
            .context("Failed to parse Python code")?;
        let root_node = tree.root_node();
        let code_bytes = code.as_bytes();

        // Locate the module docstring once: the first statement of the module
        // when it is an expression statement wrapping a single string.
        // Comments are named nodes in the tree, so a shebang, encoding line or
        // licence header ahead of the docstring must be skipped; otherwise the
        // docstring would not be recognised and would be indexed as a statement.
        let mut module_cursor = root_node.walk();
        let first_statement = root_node
            .named_children(&mut module_cursor)
            .find(|child| child.kind() != "comment");
        let module_docstring = first_statement.filter(|stmt| {
            stmt.kind() == "expression_statement"
                && stmt.named_child_count() == 1
                && stmt
                    .named_child(0)
                    .map_or(false, |inner| inner.kind() == "string")
        });

        // An expression statement consisting solely of the identifier `pass`.
        // NOTE(review): the Python grammar normally emits `pass_statement` for
        // `pass`, so this may never match; confirm before removing.
        let is_pass_stmt = |node: Node| -> bool {
            if node.kind() != "expression_statement" || node.child_count() != 1 {
                return false;
            }
            node.child(0).map_or(false, |inner| {
                inner.kind() == "identifier"
                    && code_bytes.get(inner.start_byte()..inner.end_byte())
                        == Some(&b"pass"[..])
            })
        };

        let mut chunks = Vec::new();
        let mut cursor = QueryCursor::new();
        let matches = cursor.matches(&self.query, root_node, code_bytes);
        for mat in matches {
            for capture in mat.captures {
                let node = capture.node;
                let capture_name = self.query.capture_names()[capture.index as usize];

                // The query matches at every depth; keep only direct children
                // of the module so nested constructs stay inside their parent.
                let is_top_level = node.parent().map_or(false, |p| p.kind() == "module");
                if !is_top_level {
                    continue;
                }

                if capture_name == "expr_stmt"
                    && (module_docstring == Some(node) || is_pass_stmt(node))
                {
                    continue;
                }

                if let Some(chunk) = self.node_to_chunk(node, code, file_path, capture_name) {
                    chunks.push(chunk);
                }
            }
        }
        chunks.sort_by_key(|chunk| chunk.start_line);

        // Fall back to one chunk for the whole file so non-blank input is
        // never dropped from the index.
        if chunks.is_empty() && !code.trim().is_empty() {
            log::debug!(
                "No top-level Python elements found in {}, indexing as whole file.",
                file_path
            );
            chunks.push(CodeChunk {
                content: code.to_string(),
                file_path: file_path.to_string(),
                start_line: 1,
                end_line: code.lines().count(),
                language: "python".to_string(),
                element_type: "file".to_string(),
            });
        }

        Ok(chunks)
    }
}