1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
/// A contiguous region of source text produced by chunking, carrying both
/// its location in the original text and a coarse classification.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Byte and line extent of this chunk within the original text.
    pub span: Span,
    /// The chunk's raw text content (a copy, not a borrow).
    pub text: String,
    /// What kind of construct this chunk represents.
    pub chunk_type: ChunkType,
}
11
/// Coarse classification of a [`Chunk`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ChunkType {
    /// Plain text produced by the generic line-based chunker.
    Text,
    /// A function-like definition (function, arrow function, signature).
    Function,
    /// A class-like definition (class, instance).
    Class,
    /// A method inside a class.
    Method,
    /// A module-level type declaration (data type, newtype, type synonym/family).
    Module,
}
20
21pub fn chunk_text(text: &str, language: Option<&str>) -> Result<Vec<Chunk>> {
22 tracing::debug!("Chunking text with language: {:?}, length: {} chars", language, text.len());
23
24 let result = match language {
25 Some("python") => {
26 tracing::debug!("Using Python tree-sitter parser");
27 chunk_python(text)
28 },
29 Some("typescript") | Some("javascript") => {
30 tracing::debug!("Using TypeScript/JavaScript tree-sitter parser");
31 chunk_typescript(text)
32 },
33 Some("haskell") => {
34 tracing::debug!("Using Haskell tree-sitter parser");
35 chunk_haskell(text)
36 },
37 _ => {
38 tracing::debug!("Using generic chunking strategy");
39 chunk_generic(text)
40 },
41 };
42
43 match &result {
44 Ok(chunks) => tracing::debug!("Successfully created {} chunks", chunks.len()),
45 Err(e) => tracing::warn!("Chunking failed: {}", e),
46 }
47
48 result
49}
50
51fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
52 let mut chunks = Vec::new();
53 let lines: Vec<&str> = text.lines().collect();
54 let chunk_size = 20;
55 let overlap = 5;
56
57 let mut line_byte_offsets = Vec::with_capacity(lines.len() + 1);
59 line_byte_offsets.push(0);
60 let mut cumulative_offset = 0;
61 for line in &lines {
62 cumulative_offset += line.len() + 1; line_byte_offsets.push(cumulative_offset);
64 }
65
66 let mut i = 0;
67 while i < lines.len() {
68 let end = (i + chunk_size).min(lines.len());
69 let chunk_lines = &lines[i..end];
70 let chunk_text = chunk_lines.join("\n");
71
72 let byte_start = line_byte_offsets[i];
73 let byte_end = byte_start + chunk_text.len();
74
75 chunks.push(Chunk {
76 span: Span {
77 byte_start,
78 byte_end,
79 line_start: i + 1,
80 line_end: end,
81 },
82 text: chunk_text,
83 chunk_type: ChunkType::Text,
84 });
85
86 i += chunk_size - overlap;
87 if i >= lines.len() {
88 break;
89 }
90 }
91
92 Ok(chunks)
93}
94
95fn chunk_python(text: &str) -> Result<Vec<Chunk>> {
96 let mut parser = tree_sitter::Parser::new();
97 parser.set_language(&tree_sitter_python::language())?;
98
99 let tree = parser.parse(text, None).ok_or_else(|| {
100 anyhow::anyhow!("Failed to parse Python code")
101 })?;
102
103 let mut chunks = Vec::new();
104 let mut cursor = tree.root_node().walk();
105
106 extract_code_chunks(&mut cursor, text, &mut chunks, "python");
107
108 if chunks.is_empty() {
109 return chunk_generic(text);
110 }
111
112 Ok(chunks)
113}
114
115fn chunk_typescript(text: &str) -> Result<Vec<Chunk>> {
116 let mut parser = tree_sitter::Parser::new();
117 parser.set_language(&tree_sitter_typescript::language_typescript())?;
118
119 let tree = parser.parse(text, None).ok_or_else(|| {
120 anyhow::anyhow!("Failed to parse TypeScript code")
121 })?;
122
123 let mut chunks = Vec::new();
124 let mut cursor = tree.root_node().walk();
125
126 extract_code_chunks(&mut cursor, text, &mut chunks, "typescript");
127
128 if chunks.is_empty() {
129 return chunk_generic(text);
130 }
131
132 Ok(chunks)
133}
134
135fn chunk_haskell(text: &str) -> Result<Vec<Chunk>> {
136 let mut parser = tree_sitter::Parser::new();
137 parser.set_language(&tree_sitter_haskell::language())?;
138
139 let tree = parser.parse(text, None).ok_or_else(|| {
140 anyhow::anyhow!("Failed to parse Haskell code")
141 })?;
142
143 let mut chunks = Vec::new();
144 let mut cursor = tree.root_node().walk();
145
146 extract_code_chunks(&mut cursor, text, &mut chunks, "haskell");
147
148 if chunks.is_empty() {
149 return chunk_generic(text);
150 }
151
152 Ok(chunks)
153}
154
/// Depth-first walk over the syntax tree at `cursor`, pushing a [`Chunk`]
/// for every node whose kind is a recognized definition for `language`.
///
/// Nested definitions (e.g. a method inside a class) are each emitted, so
/// the resulting chunks may overlap in span.
fn extract_code_chunks(
    cursor: &mut tree_sitter::TreeCursor,
    source: &str,
    chunks: &mut Vec<Chunk>,
    language: &str,
) {
    let node = cursor.node();
    let node_kind = node.kind();

    // Node kinds that count as chunk boundaries for each grammar.
    // NOTE(review): "type_synomym" is spelled as-is to match the node name in
    // the tree-sitter-haskell grammar — confirm against the grammar version
    // in use before "fixing" the apparent typo.
    let is_chunk = match language {
        "python" => matches!(node_kind, "function_definition" | "class_definition"),
        "typescript" | "javascript" => matches!(
            node_kind,
            "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
        ),
        "haskell" => matches!(
            node_kind,
            "signature" | "data_type" | "newtype" | "type_synomym" | "type_family" | "class" | "instance"
        ),
        _ => false,
    };

    if is_chunk {
        let start_byte = node.start_byte();
        let end_byte = node.end_byte();
        let start_pos = node.start_position();
        let end_pos = node.end_position();

        // Slice by the node's byte range. Assumes `source` is the same text
        // the tree was parsed from, so the offsets fall on char boundaries;
        // otherwise this indexing would panic.
        let text = &source[start_byte..end_byte];

        // Map the grammar node kind to our coarse chunk category. Some arms
        // here ("function", "instance_declaration") never pass the is_chunk
        // filter above and are effectively dead.
        let chunk_type = match node_kind {
            "function_definition" | "function_declaration" | "arrow_function" | "function" | "signature" => ChunkType::Function,
            "class_definition" | "class_declaration" | "instance_declaration" | "class" | "instance" => ChunkType::Class,
            "method_definition" => ChunkType::Method,
            "data_type" | "newtype" | "type_synomym" | "type_family" => ChunkType::Module,
            _ => ChunkType::Text,
        };

        chunks.push(Chunk {
            span: Span {
                byte_start: start_byte,
                byte_end: end_byte,
                // tree-sitter rows are 0-based; spans use 1-based lines.
                line_start: start_pos.row + 1,
                line_end: end_pos.row + 1,
            },
            text: text.to_string(),
            chunk_type,
        });
    }

    // Recurse into all children, then restore the cursor to this node so the
    // caller's sibling iteration continues correctly.
    if cursor.goto_first_child() {
        loop {
            extract_code_chunks(cursor, source, chunks, language);
            if !cursor.goto_next_sibling() {
                break;
            }
        }
        cursor.goto_parent();
    }
}
216
#[cfg(test)]
mod tests {
    use super::*;

    /// Every chunk's byte span must exactly cover the length of its text.
    #[test]
    fn test_chunk_generic_byte_offsets() {
        let text = "line 1\nline 2\nline 3\nline 4\nline 5";
        let chunks = chunk_generic(text).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].span.byte_start, 0);

        for chunk in &chunks {
            let span_len = chunk.span.byte_end - chunk.span.byte_start;
            assert_eq!(span_len, chunk.text.len());
        }
    }

    /// Generic chunking of a 1000-line input should finish quickly and
    /// produce well-formed line spans.
    #[test]
    fn test_chunk_generic_large_file_performance() {
        let text = (0..1000)
            .map(|i| format!("Line {}: Some content here", i))
            .collect::<Vec<_>>()
            .join("\n");

        let start = std::time::Instant::now();
        let chunks = chunk_generic(&text).unwrap();
        let duration = start.elapsed();

        assert!(duration.as_millis() < 100, "Chunking took too long: {:?}", duration);
        assert!(!chunks.is_empty());

        for chunk in &chunks {
            assert!(chunk.span.line_start > 0);
            assert!(chunk.span.line_end >= chunk.span.line_start);
        }
    }
}