ck_chunk/
lib.rs

1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
6pub struct Chunk {
7    pub span: Span,
8    pub text: String,
9    pub chunk_type: ChunkType,
10}
11
12#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
13pub enum ChunkType {
14    Text,
15    Function,
16    Class,
17    Method,
18    Module,
19}
20
21pub fn chunk_text(text: &str, language: Option<&str>) -> Result<Vec<Chunk>> {
22    tracing::debug!("Chunking text with language: {:?}, length: {} chars", language, text.len());
23    
24    let result = match language {
25        Some("python") => {
26            tracing::debug!("Using Python tree-sitter parser");
27            chunk_python(text)
28        },
29        Some("typescript") | Some("javascript") => {
30            tracing::debug!("Using TypeScript/JavaScript tree-sitter parser");
31            chunk_typescript(text)
32        },
33        Some("haskell") => {
34            tracing::debug!("Using Haskell tree-sitter parser");
35            chunk_haskell(text)
36        },
37        Some("rust") => {
38            tracing::debug!("Using Rust tree-sitter parser");
39            chunk_rust(text)
40        },
41        Some("ruby") => {
42            tracing::debug!("Using Ruby tree-sitter parser");
43            chunk_ruby(text)
44        },
45        _ => {
46            tracing::debug!("Using generic chunking strategy");
47            chunk_generic(text)
48        },
49    };
50    
51    match &result {
52        Ok(chunks) => tracing::debug!("Successfully created {} chunks", chunks.len()),
53        Err(e) => tracing::warn!("Chunking failed: {}", e),
54    }
55    
56    result
57}
58
59fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
60    let mut chunks = Vec::new();
61    let lines: Vec<&str> = text.lines().collect();
62    let chunk_size = 20;
63    let overlap = 5;
64    
65    // Pre-compute cumulative byte offsets for O(1) lookup
66    let mut line_byte_offsets = Vec::with_capacity(lines.len() + 1);
67    line_byte_offsets.push(0);
68    let mut cumulative_offset = 0;
69    for line in &lines {
70        cumulative_offset += line.len() + 1; // +1 for newline
71        line_byte_offsets.push(cumulative_offset);
72    }
73    
74    let mut i = 0;
75    while i < lines.len() {
76        let end = (i + chunk_size).min(lines.len());
77        let chunk_lines = &lines[i..end];
78        let chunk_text = chunk_lines.join("\n");
79        
80        let byte_start = line_byte_offsets[i];
81        let byte_end = byte_start + chunk_text.len();
82        
83        chunks.push(Chunk {
84            span: Span {
85                byte_start,
86                byte_end,
87                line_start: i + 1,
88                line_end: end,
89            },
90            text: chunk_text,
91            chunk_type: ChunkType::Text,
92        });
93        
94        i += chunk_size - overlap;
95        if i >= lines.len() {
96            break;
97        }
98    }
99    
100    Ok(chunks)
101}
102
103fn chunk_python(text: &str) -> Result<Vec<Chunk>> {
104    let mut parser = tree_sitter::Parser::new();
105    parser.set_language(&tree_sitter_python::language())?;
106    
107    let tree = parser.parse(text, None).ok_or_else(|| {
108        anyhow::anyhow!("Failed to parse Python code")
109    })?;
110    
111    let mut chunks = Vec::new();
112    let mut cursor = tree.root_node().walk();
113    
114    extract_code_chunks(&mut cursor, text, &mut chunks, "python");
115    
116    if chunks.is_empty() {
117        return chunk_generic(text);
118    }
119    
120    Ok(chunks)
121}
122
123fn chunk_typescript(text: &str) -> Result<Vec<Chunk>> {
124    let mut parser = tree_sitter::Parser::new();
125    parser.set_language(&tree_sitter_typescript::language_typescript())?;
126    
127    let tree = parser.parse(text, None).ok_or_else(|| {
128        anyhow::anyhow!("Failed to parse TypeScript code")
129    })?;
130    
131    let mut chunks = Vec::new();
132    let mut cursor = tree.root_node().walk();
133    
134    extract_code_chunks(&mut cursor, text, &mut chunks, "typescript");
135    
136    if chunks.is_empty() {
137        return chunk_generic(text);
138    }
139    
140    Ok(chunks)
141}
142
143fn chunk_haskell(text: &str) -> Result<Vec<Chunk>> {
144    let mut parser = tree_sitter::Parser::new();
145    parser.set_language(&tree_sitter_haskell::language())?;
146    
147    let tree = parser.parse(text, None).ok_or_else(|| {
148        anyhow::anyhow!("Failed to parse Haskell code")
149    })?;
150    
151    let mut chunks = Vec::new();
152    let mut cursor = tree.root_node().walk();
153    
154    extract_code_chunks(&mut cursor, text, &mut chunks, "haskell");
155    
156    if chunks.is_empty() {
157        return chunk_generic(text);
158    }
159    
160    Ok(chunks)
161}
162
163fn chunk_rust(text: &str) -> Result<Vec<Chunk>> {
164    let mut parser = tree_sitter::Parser::new();
165    parser.set_language(&tree_sitter_rust::language())?;
166    
167    let tree = parser.parse(text, None).ok_or_else(|| {
168        anyhow::anyhow!("Failed to parse Rust code")
169    })?;
170    
171    let mut chunks = Vec::new();
172    let mut cursor = tree.root_node().walk();
173    
174    extract_code_chunks(&mut cursor, text, &mut chunks, "rust");
175    
176    if chunks.is_empty() {
177        return chunk_generic(text);
178    }
179    
180    Ok(chunks)
181}
182
183
184fn chunk_ruby(text: &str) -> Result<Vec<Chunk>> {
185    let mut parser = tree_sitter::Parser::new();
186    parser.set_language(&tree_sitter_ruby::language())?;
187    
188    let tree = parser.parse(text, None).ok_or_else(|| {
189        anyhow::anyhow!("Failed to parse Ruby code")
190    })?;
191    
192    let mut chunks = Vec::new();
193    let mut cursor = tree.root_node().walk();
194    
195    extract_code_chunks(&mut cursor, text, &mut chunks, "ruby");
196    
197    if chunks.is_empty() {
198        return chunk_generic(text);
199    }
200    
201    Ok(chunks)
202}
203
204
205fn extract_code_chunks(
206    cursor: &mut tree_sitter::TreeCursor,
207    source: &str,
208    chunks: &mut Vec<Chunk>,
209    language: &str,
210) {
211    let node = cursor.node();
212    let node_kind = node.kind();
213    
214    
215    let is_chunk = match language {
216        "python" => matches!(node_kind, "function_definition" | "class_definition"),
217        "typescript" | "javascript" => matches!(
218            node_kind,
219            "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
220        ),
221        "haskell" => matches!(
222            node_kind,
223            "signature" | "data_type" | "newtype" | "type_synomym" | "type_family" | "class" | "instance"
224        ),
225        "rust" => matches!(
226            node_kind,
227            "function_item" | "impl_item" | "struct_item" | "enum_item" | "trait_item" | "mod_item"
228        ),
229        "ruby" => matches!(
230            node_kind,
231            "method" | "class" | "module" | "singleton_method"
232        ),
233        _ => false,
234    };
235    
236    if is_chunk {
237        let start_byte = node.start_byte();
238        let end_byte = node.end_byte();
239        let start_pos = node.start_position();
240        let end_pos = node.end_position();
241        
242        let text = &source[start_byte..end_byte];
243        
244        let chunk_type = match node_kind {
245            "function_definition" | "function_declaration" | "arrow_function" | "function" | "signature" | "function_item" | "def" | "defp" | "method" | "singleton_method" | "defn" | "defn-" => ChunkType::Function,
246            "class_definition" | "class_declaration" | "instance_declaration" | "class" | "instance" | "struct_item" | "enum_item" | "defstruct" | "defrecord" | "deftype" => ChunkType::Class,
247            "method_definition" | "defmacro" => ChunkType::Method,
248            "data_type" | "newtype" | "type_synomym" | "type_family" | "impl_item" | "trait_item" | "mod_item" | "defmodule" | "module" | "defprotocol" | "ns" => ChunkType::Module,
249            _ => ChunkType::Text,
250        };
251        
252        chunks.push(Chunk {
253            span: Span {
254                byte_start: start_byte,
255                byte_end: end_byte,
256                line_start: start_pos.row + 1,
257                line_end: end_pos.row + 1,
258            },
259            text: text.to_string(),
260            chunk_type,
261        });
262    }
263    
264    if cursor.goto_first_child() {
265        loop {
266            extract_code_chunks(cursor, source, chunks, language);
267            if !cursor.goto_next_sibling() {
268                break;
269            }
270        }
271        cursor.goto_parent();
272    }
273}
274
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the chunk's byte span, applied to `source`, yields exactly the chunk's text.
    ///
    /// Comparing lengths alone would not notice a span that is shifted inside the source.
    fn assert_span_matches_text(source: &str, chunk: &Chunk) {
        let span = &chunk.span;
        assert!(span.byte_start <= span.byte_end, "span is reversed: {:?}", span);
        assert_eq!(
            source.get(span.byte_start..span.byte_end),
            Some(chunk.text.as_str()),
            "span {:?} does not slice back to the chunk text",
            span
        );
    }

    /// Returns true when at least one chunk has the given type.
    fn has_type(chunks: &[Chunk], wanted: ChunkType) -> bool {
        chunks.iter().any(|chunk| chunk.chunk_type == wanted)
    }

    #[test]
    fn test_chunk_generic_byte_offsets() {
        // Byte offsets must describe where each chunk really sits in the input.
        let text = "line 1\nline 2\nline 3\nline 4\nline 5";
        let chunks = chunk_generic(text).unwrap();

        assert!(!chunks.is_empty());

        // First chunk should start at byte 0
        assert_eq!(chunks[0].span.byte_start, 0);

        for chunk in &chunks {
            // The span length matches the text length ...
            let expected_len = chunk.text.len();
            let actual_len = chunk.span.byte_end - chunk.span.byte_start;
            assert_eq!(actual_len, expected_len);

            // ... and the span points at that very text in the source.
            assert_span_matches_text(text, chunk);
        }
    }

    #[test]
    fn test_chunk_generic_large_file_performance() {
        // Create a large text to ensure O(n) performance
        let lines: Vec<String> = (0..1000).map(|i| format!("Line {}: Some content here", i)).collect();
        let text = lines.join("\n");

        let start = std::time::Instant::now();
        let chunks = chunk_generic(&text).unwrap();
        let duration = start.elapsed();

        // Should complete quickly even for 1000 lines
        assert!(duration.as_millis() < 100, "Chunking took too long: {:?}", duration);
        assert!(!chunks.is_empty());

        // Verify chunks have correct line numbers and offsets
        for chunk in &chunks {
            assert!(chunk.span.line_start > 0);
            assert!(chunk.span.line_end >= chunk.span.line_start);
            assert_span_matches_text(&text, chunk);
        }
    }

    #[test]
    fn test_chunk_rust() {
        let rust_code = r#"
pub struct Calculator {
    memory: f64,
}

impl Calculator {
    pub fn new() -> Self {
        Calculator { memory: 0.0 }
    }

    pub fn add(&mut self, a: f64, b: f64) -> f64 {
        a + b
    }
}

fn main() {
    let calc = Calculator::new();
}

pub mod utils {
    pub fn helper() {}
}
"#;

        let chunks = chunk_rust(rust_code).unwrap();
        assert!(!chunks.is_empty());

        // Should find struct, impl, functions, and module
        assert!(has_type(&chunks, ChunkType::Class)); // struct
        assert!(has_type(&chunks, ChunkType::Module)); // impl and mod
        assert!(has_type(&chunks, ChunkType::Function)); // functions

        for chunk in &chunks {
            assert!(chunk.span.line_start > 0);
            assert!(chunk.span.line_end >= chunk.span.line_start);
            assert_span_matches_text(rust_code, chunk);
        }
    }

    #[test]
    fn test_chunk_ruby() {
        let ruby_code = r#"
class Calculator
  def initialize
    @memory = 0.0
  end

  def add(a, b)
    a + b
  end

  def self.class_method
    "class method"
  end

  private

  def private_method
    "private"
  end
end

module Utils
  def self.helper
    "helper"
  end
end

def main
  calc = Calculator.new
end
"#;

        let chunks = chunk_ruby(ruby_code).unwrap();
        assert!(!chunks.is_empty());

        // Should find class, module, and methods
        assert!(has_type(&chunks, ChunkType::Class)); // class
        assert!(has_type(&chunks, ChunkType::Module)); // module
        assert!(has_type(&chunks, ChunkType::Function)); // methods

        for chunk in &chunks {
            assert!(chunk.span.line_start > 0);
            assert!(chunk.span.line_end >= chunk.span.line_start);
            assert_span_matches_text(ruby_code, chunk);
        }
    }

    #[test]
    fn test_language_detection_fallback() {
        // Test that unknown languages fall back to generic chunking
        let generic_text = "Some text\nwith multiple lines\nto chunk generically";

        let chunks_unknown = chunk_text(generic_text, Some("unknown_language")).unwrap();
        let chunks_generic = chunk_generic(generic_text).unwrap();

        // Should produce the same result
        assert_eq!(chunks_unknown.len(), chunks_generic.len());
        assert_eq!(chunks_unknown[0].text, chunks_generic[0].text);
    }
}