Skip to main content

qex_core/chunk/
tree_sitter.rs

1use crate::chunk::languages::LanguageChunker;
2use crate::chunk::{ChunkType, CodeChunk};
3use anyhow::{Context, Result};
4use std::path::Path;
5
6/// Raw chunk extracted from tree-sitter before enrichment
7#[derive(Debug, Clone)]
8pub struct TreeSitterChunk {
9    pub content: String,
10    pub start_line: usize,
11    pub end_line: usize,
12    pub node_type: String,
13    pub chunk_type: ChunkType,
14    pub name: Option<String>,
15    pub parent_name: Option<String>,
16    pub docstring: Option<String>,
17    pub decorators: Vec<String>,
18    pub is_async: bool,
19}
20
/// Stateless entry point for tree-sitter based chunk extraction; all
/// functionality is exposed through associated functions.
pub struct TreeSitterEngine;
23
24impl TreeSitterEngine {
25    /// Parse a source file and extract semantic chunks
26    pub fn parse_file(
27        source: &str,
28        file_path: &str,
29        relative_path: &str,
30        language: &str,
31        chunker: &dyn LanguageChunker,
32    ) -> Result<Vec<CodeChunk>> {
33        let mut parser = ::tree_sitter::Parser::new();
34        let ts_language = chunker.tree_sitter_language();
35        parser
36            .set_language(&ts_language)
37            .context("Failed to set tree-sitter language")?;
38
39        let tree = parser
40            .parse(source.as_bytes(), None)
41            .context("Failed to parse source")?;
42
43        let root = tree.root_node();
44        let mut raw_chunks = Vec::new();
45
46        Self::traverse_node(root, source, chunker, None, &mut raw_chunks);
47
48        // If no chunks found, create a single module-level chunk
49        if raw_chunks.is_empty() && !source.trim().is_empty() {
50            let line_count = source.lines().count();
51            raw_chunks.push(TreeSitterChunk {
52                content: source.to_string(),
53                start_line: 1,
54                end_line: line_count,
55                node_type: "module".to_string(),
56                chunk_type: ChunkType::ModuleLevel,
57                name: Path::new(relative_path)
58                    .file_stem()
59                    .and_then(|s| s.to_str())
60                    .map(String::from),
61                parent_name: None,
62                docstring: None,
63                decorators: Vec::new(),
64                is_async: false,
65            });
66        }
67
68        // Convert raw chunks to enriched CodeChunks
69        let folder_structure = CodeChunk::extract_folder_structure(relative_path);
70        let chunks = raw_chunks
71            .into_iter()
72            .map(|raw| {
73                let id = CodeChunk::generate_id(file_path, raw.start_line, raw.end_line, raw.name.as_deref());
74                let tags = CodeChunk::extract_tags(&raw.content, &raw.chunk_type);
75                let complexity_score = CodeChunk::compute_complexity(&raw.content);
76                let imports = Self::extract_imports(&raw.content, language);
77
78                CodeChunk {
79                    id,
80                    content: raw.content,
81                    chunk_type: raw.chunk_type,
82                    start_line: raw.start_line,
83                    end_line: raw.end_line,
84                    file_path: file_path.to_string(),
85                    relative_path: relative_path.to_string(),
86                    folder_structure: folder_structure.clone(),
87                    name: raw.name,
88                    parent_name: raw.parent_name,
89                    language: language.to_string(),
90                    docstring: raw.docstring,
91                    decorators: raw.decorators,
92                    imports,
93                    tags,
94                    complexity_score,
95                }
96            })
97            .collect();
98
99        Ok(chunks)
100    }
101
102    /// Recursively traverse tree-sitter nodes to find splittable chunks
103    fn traverse_node(
104        node: ::tree_sitter::Node,
105        source: &str,
106        chunker: &dyn LanguageChunker,
107        parent_name: Option<&str>,
108        chunks: &mut Vec<TreeSitterChunk>,
109    ) {
110        let node_type = node.kind();
111
112        if chunker.is_splittable(node_type) {
113            // Extract this node as a chunk
114            let start_byte = node.start_byte();
115            let end_byte = node.end_byte();
116            let content = &source[start_byte..end_byte];
117            let start_line = node.start_position().row + 1;
118            let end_line = node.end_position().row + 1;
119
120            let metadata = chunker.extract_metadata(node, source);
121            let chunk_type = chunker.classify_node(node_type, parent_name);
122
123            let chunk = TreeSitterChunk {
124                content: content.to_string(),
125                start_line,
126                end_line,
127                node_type: node_type.to_string(),
128                chunk_type,
129                name: metadata.name.clone(),
130                parent_name: parent_name.map(String::from),
131                docstring: metadata.docstring,
132                decorators: metadata.decorators,
133                is_async: metadata.is_async,
134            };
135
136            chunks.push(chunk);
137
138            // For classes/impls, also traverse children for nested methods
139            let current_name = metadata.name.as_deref().or(parent_name);
140            if chunker.has_nested_chunks(node_type) {
141                let mut cursor = node.walk();
142                for child in node.children(&mut cursor) {
143                    Self::traverse_node(child, source, chunker, current_name, chunks);
144                }
145            }
146        } else {
147            // Not splittable, recurse into children
148            let mut cursor = node.walk();
149            for child in node.children(&mut cursor) {
150                Self::traverse_node(child, source, chunker, parent_name, chunks);
151            }
152        }
153    }
154
155    /// Extract import statements from source
156    fn extract_imports(content: &str, language: &str) -> Vec<String> {
157        let mut imports = Vec::new();
158        for line in content.lines() {
159            let trimmed = line.trim();
160            let is_import = match language {
161                "python" => trimmed.starts_with("import ") || trimmed.starts_with("from "),
162                "javascript" | "typescript" | "tsx" | "jsx" => {
163                    trimmed.starts_with("import ") || trimmed.starts_with("require(")
164                }
165                "rust" => trimmed.starts_with("use ") || trimmed.starts_with("extern crate"),
166                "go" => trimmed.starts_with("import "),
167                "java" | "csharp" => trimmed.starts_with("import ") || trimmed.starts_with("using "),
168                "c" | "cpp" => trimmed.starts_with("#include"),
169                _ => false,
170            };
171            if is_import {
172                imports.push(trimmed.to_string());
173            }
174        }
175        imports
176    }
177}