//! ck_chunk/lib.rs — splits source text into chunks (tree-sitter based for
//! supported languages, line-window based otherwise).

1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
/// A contiguous region of a source document produced by one of the chunkers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Byte and 1-based line coordinates of this chunk within the original text.
    pub span: Span,
    /// The chunk's raw text, copied from the source.
    pub text: String,
    /// Coarse syntactic category of the chunk (function, class, plain text, …).
    pub chunk_type: ChunkType,
}
11
/// Coarse syntactic category of a [`Chunk`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ChunkType {
    /// Plain text from the generic line-window chunker (no syntax information).
    Text,
    /// A function definition (incl. arrow functions and Haskell signatures).
    Function,
    /// A class definition (incl. Haskell type classes and instances).
    Class,
    /// A method defined inside a class.
    Method,
    /// Type-level declarations (Haskell data/newtype/type synonym/type family).
    Module,
}
20
21pub fn chunk_text(text: &str, language: Option<&str>) -> Result<Vec<Chunk>> {
22    tracing::debug!("Chunking text with language: {:?}, length: {} chars", language, text.len());
23    
24    let result = match language {
25        Some("python") => {
26            tracing::debug!("Using Python tree-sitter parser");
27            chunk_python(text)
28        },
29        Some("typescript") | Some("javascript") => {
30            tracing::debug!("Using TypeScript/JavaScript tree-sitter parser");
31            chunk_typescript(text)
32        },
33        Some("haskell") => {
34            tracing::debug!("Using Haskell tree-sitter parser");
35            chunk_haskell(text)
36        },
37        _ => {
38            tracing::debug!("Using generic chunking strategy");
39            chunk_generic(text)
40        },
41    };
42    
43    match &result {
44        Ok(chunks) => tracing::debug!("Successfully created {} chunks", chunks.len()),
45        Err(e) => tracing::warn!("Chunking failed: {}", e),
46    }
47    
48    result
49}
50
51fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
52    let mut chunks = Vec::new();
53    let lines: Vec<&str> = text.lines().collect();
54    let chunk_size = 20;
55    let overlap = 5;
56    
57    // Pre-compute cumulative byte offsets for O(1) lookup
58    let mut line_byte_offsets = Vec::with_capacity(lines.len() + 1);
59    line_byte_offsets.push(0);
60    let mut cumulative_offset = 0;
61    for line in &lines {
62        cumulative_offset += line.len() + 1; // +1 for newline
63        line_byte_offsets.push(cumulative_offset);
64    }
65    
66    let mut i = 0;
67    while i < lines.len() {
68        let end = (i + chunk_size).min(lines.len());
69        let chunk_lines = &lines[i..end];
70        let chunk_text = chunk_lines.join("\n");
71        
72        let byte_start = line_byte_offsets[i];
73        let byte_end = byte_start + chunk_text.len();
74        
75        chunks.push(Chunk {
76            span: Span {
77                byte_start,
78                byte_end,
79                line_start: i + 1,
80                line_end: end,
81            },
82            text: chunk_text,
83            chunk_type: ChunkType::Text,
84        });
85        
86        i += chunk_size - overlap;
87        if i >= lines.len() {
88            break;
89        }
90    }
91    
92    Ok(chunks)
93}
94
95fn chunk_python(text: &str) -> Result<Vec<Chunk>> {
96    let mut parser = tree_sitter::Parser::new();
97    parser.set_language(&tree_sitter_python::language())?;
98    
99    let tree = parser.parse(text, None).ok_or_else(|| {
100        anyhow::anyhow!("Failed to parse Python code")
101    })?;
102    
103    let mut chunks = Vec::new();
104    let mut cursor = tree.root_node().walk();
105    
106    extract_code_chunks(&mut cursor, text, &mut chunks, "python");
107    
108    if chunks.is_empty() {
109        return chunk_generic(text);
110    }
111    
112    Ok(chunks)
113}
114
115fn chunk_typescript(text: &str) -> Result<Vec<Chunk>> {
116    let mut parser = tree_sitter::Parser::new();
117    parser.set_language(&tree_sitter_typescript::language_typescript())?;
118    
119    let tree = parser.parse(text, None).ok_or_else(|| {
120        anyhow::anyhow!("Failed to parse TypeScript code")
121    })?;
122    
123    let mut chunks = Vec::new();
124    let mut cursor = tree.root_node().walk();
125    
126    extract_code_chunks(&mut cursor, text, &mut chunks, "typescript");
127    
128    if chunks.is_empty() {
129        return chunk_generic(text);
130    }
131    
132    Ok(chunks)
133}
134
135fn chunk_haskell(text: &str) -> Result<Vec<Chunk>> {
136    let mut parser = tree_sitter::Parser::new();
137    parser.set_language(&tree_sitter_haskell::language())?;
138    
139    let tree = parser.parse(text, None).ok_or_else(|| {
140        anyhow::anyhow!("Failed to parse Haskell code")
141    })?;
142    
143    let mut chunks = Vec::new();
144    let mut cursor = tree.root_node().walk();
145    
146    extract_code_chunks(&mut cursor, text, &mut chunks, "haskell");
147    
148    if chunks.is_empty() {
149        return chunk_generic(text);
150    }
151    
152    Ok(chunks)
153}
154
/// Recursively walk the syntax tree under `cursor`, appending a [`Chunk`] to
/// `chunks` for every node whose kind is a chunk boundary for `language`.
///
/// Traversal is pre-order: a matching node is recorded before its children
/// are visited, so nested definitions (e.g. a method inside a class) each
/// produce their own, overlapping chunk — by design.
fn extract_code_chunks(
    cursor: &mut tree_sitter::TreeCursor,
    source: &str,
    chunks: &mut Vec<Chunk>,
    language: &str,
) {
    let node = cursor.node();
    let node_kind = node.kind();

    // Node kinds that delimit a chunk, per language grammar.
    //
    // NOTE(review): "type_synomym" looks misspelled, but some versions of the
    // tree-sitter-haskell grammar use exactly this node name — check the
    // grammar's node-types.json before "fixing" the spelling.
    let is_chunk = match language {
        "python" => matches!(node_kind, "function_definition" | "class_definition"),
        "typescript" | "javascript" => matches!(
            node_kind,
            "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
        ),
        "haskell" => matches!(
            node_kind,
            "signature" | "data_type" | "newtype" | "type_synomym" | "type_family" | "class" | "instance"
        ),
        _ => false,
    };

    if is_chunk {
        let start_byte = node.start_byte();
        let end_byte = node.end_byte();
        let start_pos = node.start_position();
        let end_pos = node.end_position();

        // Tree-sitter byte offsets fall on char boundaries for valid UTF-8
        // input, so this slice should not panic in practice.
        let text = &source[start_byte..end_byte];

        // Map the node kind to a coarse category. Some arms ("function",
        // "instance_declaration") never appear in `is_chunk` above and are
        // effectively unreachable — kept as defensive defaults.
        let chunk_type = match node_kind {
            "function_definition" | "function_declaration" | "arrow_function" | "function" | "signature" => ChunkType::Function,
            "class_definition" | "class_declaration" | "instance_declaration" | "class" | "instance" => ChunkType::Class,
            "method_definition" => ChunkType::Method,
            "data_type" | "newtype" | "type_synomym" | "type_family" => ChunkType::Module,
            _ => ChunkType::Text,
        };

        chunks.push(Chunk {
            span: Span {
                byte_start: start_byte,
                byte_end: end_byte,
                // tree-sitter rows are 0-based; spans use 1-based lines.
                line_start: start_pos.row + 1,
                line_end: end_pos.row + 1,
            },
            text: text.to_string(),
            chunk_type,
        });
    }

    // Depth-first descent: visit every child, then restore the cursor to
    // this node so the caller's sibling iteration stays valid.
    if cursor.goto_first_child() {
        loop {
            extract_code_chunks(cursor, source, chunks, language);
            if !cursor.goto_next_sibling() {
                break;
            }
        }
        cursor.goto_parent();
    }
}
216
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_chunk_generic_byte_offsets() {
        // The prefix-sum offset table must yield spans whose byte width
        // matches the chunk text exactly.
        let input = "line 1\nline 2\nline 3\nline 4\nline 5";
        let result = chunk_generic(input).unwrap();

        assert!(!result.is_empty());

        // The very first chunk starts at the beginning of the input.
        assert_eq!(result[0].span.byte_start, 0);

        // Span width == text length for every chunk.
        for c in &result {
            assert_eq!(c.span.byte_end - c.span.byte_start, c.text.len());
        }
    }

    #[test]
    fn test_chunk_generic_large_file_performance() {
        // 1000 lines should chunk well inside the budget thanks to the
        // O(n) offset precomputation.
        let body = (0..1000)
            .map(|i| format!("Line {}: Some content here", i))
            .collect::<Vec<_>>()
            .join("\n");

        let t0 = std::time::Instant::now();
        let result = chunk_generic(&body).unwrap();
        let elapsed = t0.elapsed();

        assert!(elapsed.as_millis() < 100, "Chunking took too long: {:?}", elapsed);
        assert!(!result.is_empty());

        // Line numbers are 1-based and well ordered.
        for c in &result {
            assert!(c.span.line_start > 0);
            assert!(c.span.line_end >= c.span.line_start);
        }
    }
}