1use crate::chunk::languages::LanguageChunker;
2use crate::chunk::{ChunkType, CodeChunk};
3use anyhow::{Context, Result};
4use std::path::Path;
5
/// Intermediate chunk produced while walking the tree-sitter syntax tree,
/// before it is converted into a `CodeChunk`.
#[derive(Debug, Clone)]
pub struct TreeSitterChunk {
    /// Source text covered by the node (or the whole file for the fallback chunk).
    pub content: String,
    /// 1-based line on which the chunk starts.
    pub start_line: usize,
    /// 1-based line on which the chunk ends.
    pub end_line: usize,
    /// Tree-sitter node kind, or `"module"` for the whole-file fallback chunk.
    pub node_type: String,
    /// Classification assigned by the language chunker.
    pub chunk_type: ChunkType,
    /// Name reported by the chunker's metadata, if any (the file stem for the
    /// whole-file fallback chunk).
    pub name: Option<String>,
    /// Name of the nearest enclosing chunk that had a name, if any.
    pub parent_name: Option<String>,
    /// Docstring reported by the chunker's metadata, if any.
    pub docstring: Option<String>,
    /// Decorators reported by the chunker's metadata.
    pub decorators: Vec<String>,
    /// Async flag reported by the chunker's metadata.
    pub is_async: bool,
}
20
/// Field-less entry point for tree-sitter based chunking; it holds no state.
pub struct TreeSitterEngine;
23
24impl TreeSitterEngine {
25 pub fn parse_file(
27 source: &str,
28 file_path: &str,
29 relative_path: &str,
30 language: &str,
31 chunker: &dyn LanguageChunker,
32 ) -> Result<Vec<CodeChunk>> {
33 let mut parser = ::tree_sitter::Parser::new();
34 let ts_language = chunker.tree_sitter_language();
35 parser
36 .set_language(&ts_language)
37 .context("Failed to set tree-sitter language")?;
38
39 let tree = parser
40 .parse(source.as_bytes(), None)
41 .context("Failed to parse source")?;
42
43 let root = tree.root_node();
44 let mut raw_chunks = Vec::new();
45
46 Self::traverse_node(root, source, chunker, None, &mut raw_chunks);
47
48 if raw_chunks.is_empty() && !source.trim().is_empty() {
50 let line_count = source.lines().count();
51 raw_chunks.push(TreeSitterChunk {
52 content: source.to_string(),
53 start_line: 1,
54 end_line: line_count,
55 node_type: "module".to_string(),
56 chunk_type: ChunkType::ModuleLevel,
57 name: Path::new(relative_path)
58 .file_stem()
59 .and_then(|s| s.to_str())
60 .map(String::from),
61 parent_name: None,
62 docstring: None,
63 decorators: Vec::new(),
64 is_async: false,
65 });
66 }
67
68 let folder_structure = CodeChunk::extract_folder_structure(relative_path);
70 let chunks = raw_chunks
71 .into_iter()
72 .map(|raw| {
73 let id = CodeChunk::generate_id(file_path, raw.start_line, raw.end_line, raw.name.as_deref());
74 let tags = CodeChunk::extract_tags(&raw.content, &raw.chunk_type);
75 let complexity_score = CodeChunk::compute_complexity(&raw.content);
76 let imports = Self::extract_imports(&raw.content, language);
77
78 CodeChunk {
79 id,
80 content: raw.content,
81 chunk_type: raw.chunk_type,
82 start_line: raw.start_line,
83 end_line: raw.end_line,
84 file_path: file_path.to_string(),
85 relative_path: relative_path.to_string(),
86 folder_structure: folder_structure.clone(),
87 name: raw.name,
88 parent_name: raw.parent_name,
89 language: language.to_string(),
90 docstring: raw.docstring,
91 decorators: raw.decorators,
92 imports,
93 tags,
94 complexity_score,
95 }
96 })
97 .collect();
98
99 Ok(chunks)
100 }
101
102 fn traverse_node(
104 node: ::tree_sitter::Node,
105 source: &str,
106 chunker: &dyn LanguageChunker,
107 parent_name: Option<&str>,
108 chunks: &mut Vec<TreeSitterChunk>,
109 ) {
110 let node_type = node.kind();
111
112 if chunker.is_splittable(node_type) {
113 let start_byte = node.start_byte();
115 let end_byte = node.end_byte();
116 let content = &source[start_byte..end_byte];
117 let start_line = node.start_position().row + 1;
118 let end_line = node.end_position().row + 1;
119
120 let metadata = chunker.extract_metadata(node, source);
121 let chunk_type = chunker.classify_node(node_type, parent_name);
122
123 let chunk = TreeSitterChunk {
124 content: content.to_string(),
125 start_line,
126 end_line,
127 node_type: node_type.to_string(),
128 chunk_type,
129 name: metadata.name.clone(),
130 parent_name: parent_name.map(String::from),
131 docstring: metadata.docstring,
132 decorators: metadata.decorators,
133 is_async: metadata.is_async,
134 };
135
136 chunks.push(chunk);
137
138 let current_name = metadata.name.as_deref().or(parent_name);
140 if chunker.has_nested_chunks(node_type) {
141 let mut cursor = node.walk();
142 for child in node.children(&mut cursor) {
143 Self::traverse_node(child, source, chunker, current_name, chunks);
144 }
145 }
146 } else {
147 let mut cursor = node.walk();
149 for child in node.children(&mut cursor) {
150 Self::traverse_node(child, source, chunker, parent_name, chunks);
151 }
152 }
153 }
154
155 fn extract_imports(content: &str, language: &str) -> Vec<String> {
157 let mut imports = Vec::new();
158 for line in content.lines() {
159 let trimmed = line.trim();
160 let is_import = match language {
161 "python" => trimmed.starts_with("import ") || trimmed.starts_with("from "),
162 "javascript" | "typescript" | "tsx" | "jsx" => {
163 trimmed.starts_with("import ") || trimmed.starts_with("require(")
164 }
165 "rust" => trimmed.starts_with("use ") || trimmed.starts_with("extern crate"),
166 "go" => trimmed.starts_with("import "),
167 "java" | "csharp" => trimmed.starts_with("import ") || trimmed.starts_with("using "),
168 "c" | "cpp" => trimmed.starts_with("#include"),
169 _ => false,
170 };
171 if is_import {
172 imports.push(trimmed.to_string());
173 }
174 }
175 imports
176 }
177}