1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
6pub struct Chunk {
7 pub span: Span,
8 pub text: String,
9 pub chunk_type: ChunkType,
10}
11
12#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
13pub enum ChunkType {
14 Text,
15 Function,
16 Class,
17 Method,
18 Module,
19}
20
21pub fn chunk_text(text: &str, language: Option<&str>) -> Result<Vec<Chunk>> {
22 tracing::debug!("Chunking text with language: {:?}, length: {} chars", language, text.len());
23
24 let result = match language {
25 Some("python") => {
26 tracing::debug!("Using Python tree-sitter parser");
27 chunk_python(text)
28 },
29 Some("typescript") | Some("javascript") => {
30 tracing::debug!("Using TypeScript/JavaScript tree-sitter parser");
31 chunk_typescript(text)
32 },
33 Some("haskell") => {
34 tracing::debug!("Using Haskell tree-sitter parser");
35 chunk_haskell(text)
36 },
37 Some("rust") => {
38 tracing::debug!("Using Rust tree-sitter parser");
39 chunk_rust(text)
40 },
41 Some("ruby") => {
42 tracing::debug!("Using Ruby tree-sitter parser");
43 chunk_ruby(text)
44 },
45 _ => {
46 tracing::debug!("Using generic chunking strategy");
47 chunk_generic(text)
48 },
49 };
50
51 match &result {
52 Ok(chunks) => tracing::debug!("Successfully created {} chunks", chunks.len()),
53 Err(e) => tracing::warn!("Chunking failed: {}", e),
54 }
55
56 result
57}
58
59fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
60 let mut chunks = Vec::new();
61 let lines: Vec<&str> = text.lines().collect();
62 let chunk_size = 20;
63 let overlap = 5;
64
65 let mut line_byte_offsets = Vec::with_capacity(lines.len() + 1);
67 line_byte_offsets.push(0);
68 let mut cumulative_offset = 0;
69 for line in &lines {
70 cumulative_offset += line.len() + 1; line_byte_offsets.push(cumulative_offset);
72 }
73
74 let mut i = 0;
75 while i < lines.len() {
76 let end = (i + chunk_size).min(lines.len());
77 let chunk_lines = &lines[i..end];
78 let chunk_text = chunk_lines.join("\n");
79
80 let byte_start = line_byte_offsets[i];
81 let byte_end = byte_start + chunk_text.len();
82
83 chunks.push(Chunk {
84 span: Span {
85 byte_start,
86 byte_end,
87 line_start: i + 1,
88 line_end: end,
89 },
90 text: chunk_text,
91 chunk_type: ChunkType::Text,
92 });
93
94 i += chunk_size - overlap;
95 if i >= lines.len() {
96 break;
97 }
98 }
99
100 Ok(chunks)
101}
102
103fn chunk_python(text: &str) -> Result<Vec<Chunk>> {
104 let mut parser = tree_sitter::Parser::new();
105 parser.set_language(&tree_sitter_python::language())?;
106
107 let tree = parser.parse(text, None).ok_or_else(|| {
108 anyhow::anyhow!("Failed to parse Python code")
109 })?;
110
111 let mut chunks = Vec::new();
112 let mut cursor = tree.root_node().walk();
113
114 extract_code_chunks(&mut cursor, text, &mut chunks, "python");
115
116 if chunks.is_empty() {
117 return chunk_generic(text);
118 }
119
120 Ok(chunks)
121}
122
123fn chunk_typescript(text: &str) -> Result<Vec<Chunk>> {
124 let mut parser = tree_sitter::Parser::new();
125 parser.set_language(&tree_sitter_typescript::language_typescript())?;
126
127 let tree = parser.parse(text, None).ok_or_else(|| {
128 anyhow::anyhow!("Failed to parse TypeScript code")
129 })?;
130
131 let mut chunks = Vec::new();
132 let mut cursor = tree.root_node().walk();
133
134 extract_code_chunks(&mut cursor, text, &mut chunks, "typescript");
135
136 if chunks.is_empty() {
137 return chunk_generic(text);
138 }
139
140 Ok(chunks)
141}
142
143fn chunk_haskell(text: &str) -> Result<Vec<Chunk>> {
144 let mut parser = tree_sitter::Parser::new();
145 parser.set_language(&tree_sitter_haskell::language())?;
146
147 let tree = parser.parse(text, None).ok_or_else(|| {
148 anyhow::anyhow!("Failed to parse Haskell code")
149 })?;
150
151 let mut chunks = Vec::new();
152 let mut cursor = tree.root_node().walk();
153
154 extract_code_chunks(&mut cursor, text, &mut chunks, "haskell");
155
156 if chunks.is_empty() {
157 return chunk_generic(text);
158 }
159
160 Ok(chunks)
161}
162
163fn chunk_rust(text: &str) -> Result<Vec<Chunk>> {
164 let mut parser = tree_sitter::Parser::new();
165 parser.set_language(&tree_sitter_rust::language())?;
166
167 let tree = parser.parse(text, None).ok_or_else(|| {
168 anyhow::anyhow!("Failed to parse Rust code")
169 })?;
170
171 let mut chunks = Vec::new();
172 let mut cursor = tree.root_node().walk();
173
174 extract_code_chunks(&mut cursor, text, &mut chunks, "rust");
175
176 if chunks.is_empty() {
177 return chunk_generic(text);
178 }
179
180 Ok(chunks)
181}
182
183
184fn chunk_ruby(text: &str) -> Result<Vec<Chunk>> {
185 let mut parser = tree_sitter::Parser::new();
186 parser.set_language(&tree_sitter_ruby::language())?;
187
188 let tree = parser.parse(text, None).ok_or_else(|| {
189 anyhow::anyhow!("Failed to parse Ruby code")
190 })?;
191
192 let mut chunks = Vec::new();
193 let mut cursor = tree.root_node().walk();
194
195 extract_code_chunks(&mut cursor, text, &mut chunks, "ruby");
196
197 if chunks.is_empty() {
198 return chunk_generic(text);
199 }
200
201 Ok(chunks)
202}
203
204
205fn extract_code_chunks(
206 cursor: &mut tree_sitter::TreeCursor,
207 source: &str,
208 chunks: &mut Vec<Chunk>,
209 language: &str,
210) {
211 let node = cursor.node();
212 let node_kind = node.kind();
213
214
215 let is_chunk = match language {
216 "python" => matches!(node_kind, "function_definition" | "class_definition"),
217 "typescript" | "javascript" => matches!(
218 node_kind,
219 "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
220 ),
221 "haskell" => matches!(
222 node_kind,
223 "signature" | "data_type" | "newtype" | "type_synomym" | "type_family" | "class" | "instance"
224 ),
225 "rust" => matches!(
226 node_kind,
227 "function_item" | "impl_item" | "struct_item" | "enum_item" | "trait_item" | "mod_item"
228 ),
229 "ruby" => matches!(
230 node_kind,
231 "method" | "class" | "module" | "singleton_method"
232 ),
233 _ => false,
234 };
235
236 if is_chunk {
237 let start_byte = node.start_byte();
238 let end_byte = node.end_byte();
239 let start_pos = node.start_position();
240 let end_pos = node.end_position();
241
242 let text = &source[start_byte..end_byte];
243
244 let chunk_type = match node_kind {
245 "function_definition" | "function_declaration" | "arrow_function" | "function" | "signature" | "function_item" | "def" | "defp" | "method" | "singleton_method" | "defn" | "defn-" => ChunkType::Function,
246 "class_definition" | "class_declaration" | "instance_declaration" | "class" | "instance" | "struct_item" | "enum_item" | "defstruct" | "defrecord" | "deftype" => ChunkType::Class,
247 "method_definition" | "defmacro" => ChunkType::Method,
248 "data_type" | "newtype" | "type_synomym" | "type_family" | "impl_item" | "trait_item" | "mod_item" | "defmodule" | "module" | "defprotocol" | "ns" => ChunkType::Module,
249 _ => ChunkType::Text,
250 };
251
252 chunks.push(Chunk {
253 span: Span {
254 byte_start: start_byte,
255 byte_end: end_byte,
256 line_start: start_pos.row + 1,
257 line_end: end_pos.row + 1,
258 },
259 text: text.to_string(),
260 chunk_type,
261 });
262 }
263
264 if cursor.goto_first_child() {
265 loop {
266 extract_code_chunks(cursor, source, chunks, language);
267 if !cursor.goto_next_sibling() {
268 break;
269 }
270 }
271 cursor.goto_parent();
272 }
273}
274
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `chunk.span` addresses exactly `chunk.text` inside `source`.
    fn assert_span_matches_text(source: &str, chunk: &Chunk) {
        let span = &chunk.span;
        assert_eq!(span.byte_end - span.byte_start, chunk.text.len());
        assert_eq!(&source[span.byte_start..span.byte_end], chunk.text);
    }

    #[test]
    fn test_chunk_generic_byte_offsets() {
        let text = "line 1\nline 2\nline 3\nline 4\nline 5";
        let chunks = chunk_generic(text).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].span.byte_start, 0);

        // Compare content, not just length: equal lengths alone would not catch shifted offsets.
        for chunk in &chunks {
            assert_span_matches_text(text, chunk);
        }
    }

    #[test]
    fn test_chunk_generic_window_overlap() {
        let lines: Vec<String> = (0..25).map(|i| format!("row {}", i)).collect();
        let text = lines.join("\n");

        let chunks = chunk_generic(&text).unwrap();

        // 25 lines: one window for lines 1-20, then one starting 15 lines later for lines 16-25.
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].span.line_start, 1);
        assert_eq!(chunks[0].span.line_end, 20);
        assert_eq!(chunks[1].span.line_start, 16);
        assert_eq!(chunks[1].span.line_end, 25);
        assert!(chunks[1].text.starts_with("row 15\n"));
        assert!(chunks.iter().all(|c| c.chunk_type == ChunkType::Text));

        for chunk in &chunks {
            assert_span_matches_text(&text, chunk);
        }
    }

    #[test]
    fn test_chunk_generic_empty_input() {
        assert!(chunk_generic("").unwrap().is_empty());
    }

    #[test]
    fn test_chunk_generic_large_file_performance() {
        let lines: Vec<String> = (0..1000)
            .map(|i| format!("Line {}: Some content here", i))
            .collect();
        let text = lines.join("\n");

        let start = std::time::Instant::now();
        let chunks = chunk_generic(&text).unwrap();
        let duration = start.elapsed();

        assert!(duration.as_millis() < 100, "Chunking took too long: {:?}", duration);
        assert!(!chunks.is_empty());

        for chunk in &chunks {
            assert!(chunk.span.line_start > 0);
            assert!(chunk.span.line_end >= chunk.span.line_start);
            assert_span_matches_text(&text, chunk);
        }
    }

    #[test]
    fn test_chunk_rust() {
        let rust_code = r#"
pub struct Calculator {
    memory: f64,
}

impl Calculator {
    pub fn new() -> Self {
        Calculator { memory: 0.0 }
    }

    pub fn add(&mut self, a: f64, b: f64) -> f64 {
        a + b
    }
}

fn main() {
    let calc = Calculator::new();
}

pub mod utils {
    pub fn helper() {}
}
"#;

        let chunks = chunk_rust(rust_code).unwrap();
        assert!(!chunks.is_empty());

        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Class));
        assert!(chunk_types.contains(&&ChunkType::Module));
        assert!(chunk_types.contains(&&ChunkType::Function));
    }

    #[test]
    fn test_chunk_ruby() {
        let ruby_code = r#"
class Calculator
    def initialize
        @memory = 0.0
    end

    def add(a, b)
        a + b
    end

    def self.class_method
        "class method"
    end

    private

    def private_method
        "private"
    end
end

module Utils
    def self.helper
        "helper"
    end
end

def main
    calc = Calculator.new
end
"#;

        let chunks = chunk_ruby(ruby_code).unwrap();
        assert!(!chunks.is_empty());

        let chunk_types: Vec<&ChunkType> = chunks.iter().map(|c| &c.chunk_type).collect();
        assert!(chunk_types.contains(&&ChunkType::Class));
        assert!(chunk_types.contains(&&ChunkType::Module));
        assert!(chunk_types.contains(&&ChunkType::Function));
    }

    #[test]
    fn test_language_detection_fallback() {
        let generic_text = "Some text\nwith multiple lines\nto chunk generically";

        let chunks_unknown = chunk_text(generic_text, Some("unknown_language")).unwrap();
        let chunks_generic = chunk_generic(generic_text).unwrap();

        assert_eq!(chunks_unknown.len(), chunks_generic.len());
        assert_eq!(chunks_unknown[0].text, chunks_generic[0].text);
    }
}