// batuta/oracle/rag/chunker.rs
1//! Semantic Chunker for Code-Aware Text Splitting
2//!
3//! Implements recursive character splitting with code-aware boundaries.
4//! Based on LangChain text splitter patterns [1] and Chen et al. (2017) [21].
5
6use super::fingerprint::ChunkerConfig;
7
/// Semantic chunker with code-aware splitting
///
/// Uses recursive character splitting with Rust/Markdown-aware separators.
/// Preserves context through configurable overlap.
///
/// Sizes are measured in bytes of UTF-8 text (treated as an approximation
/// of characters/tokens); all split points are clamped to valid char
/// boundaries before slicing.
#[derive(Debug, Clone)]
pub struct SemanticChunker {
    /// Target chunk size in characters (approximates tokens)
    chunk_size: usize,
    /// Overlap between chunks for context preservation; should be smaller
    /// than `chunk_size` so splitting always advances (see `split`)
    chunk_overlap: usize,
    /// Code-aware separators ordered by priority (highest to lowest);
    /// the first separator found in the search window wins
    separators: Vec<String>,
}
21
22impl SemanticChunker {
23    /// Create a new semantic chunker with custom settings
24    pub fn new(chunk_size: usize, chunk_overlap: usize, separators: Vec<String>) -> Self {
25        Self { chunk_size, chunk_overlap, separators }
26    }
27
28    /// Create from a ChunkerConfig
29    pub fn from_config(config: &ChunkerConfig) -> Self {
30        Self {
31            chunk_size: config.chunk_size,
32            chunk_overlap: config.chunk_overlap,
33            separators: Self::default_separators(),
34        }
35    }
36
37    /// Default separators for Rust/Markdown content
38    fn default_separators() -> Vec<String> {
39        vec![
40            "\n## ".to_string(),     // Markdown H2
41            "\n### ".to_string(),    // Markdown H3
42            "\n#### ".to_string(),   // Markdown H4
43            "\nfn ".to_string(),     // Rust function
44            "\npub fn ".to_string(), // Rust public function
45            "\nimpl ".to_string(),   // Rust impl block
46            "\nstruct ".to_string(), // Rust struct
47            "\nenum ".to_string(),   // Rust enum
48            "\nmod ".to_string(),    // Rust module
49            "\n```".to_string(),     // Code fence
50            "\n\n".to_string(),      // Paragraph
51            "\n".to_string(),        // Line
52            " ".to_string(),         // Word
53        ]
54    }
55
56    /// Split text into chunks
57    pub fn split(&self, text: &str) -> Vec<Chunk> {
58        let mut chunks = Vec::new();
59        let mut current_pos = 0;
60
61        while current_pos < text.len() {
62            let (chunk_text, end_pos) = self.extract_chunk(text, current_pos);
63
64            if !chunk_text.trim().is_empty() {
65                let start_line = text[..current_pos].matches('\n').count() + 1;
66                let end_line = start_line + chunk_text.matches('\n').count();
67
68                chunks.push(Chunk {
69                    content: chunk_text.to_string(),
70                    start_offset: current_pos,
71                    end_offset: end_pos,
72                    start_line,
73                    end_line,
74                });
75            }
76
77            // Move forward, accounting for overlap
78            let advance = if end_pos - current_pos > self.chunk_overlap {
79                end_pos - current_pos - self.chunk_overlap
80            } else {
81                end_pos - current_pos
82            };
83
84            // Advance at least 1, ensuring we land on a char boundary
85            let new_pos = current_pos + advance.max(1);
86            current_pos = Self::find_next_char_boundary(text, new_pos);
87        }
88
89        chunks
90    }
91
92    /// Extract a single chunk starting at position
93    fn extract_chunk(&self, text: &str, start: usize) -> (String, usize) {
94        let remaining = &text[start..];
95        let target_end = Self::find_char_boundary(text, (start + self.chunk_size).min(text.len()));
96
97        // If remaining text fits in one chunk, return it all
98        if start + remaining.len() <= target_end {
99            return (remaining.to_string(), text.len());
100        }
101
102        // Find the best split point using separators
103        let search_region = &text[start..target_end];
104
105        for separator in &self.separators {
106            if let Some(pos) = search_region.rfind(separator.as_str()) {
107                if pos > 0 {
108                    // Include the separator in the chunk
109                    let end = start + pos + separator.len();
110                    return (text[start..end].to_string(), end);
111                }
112            }
113        }
114
115        // No separator found, hard cut at nearest char boundary
116        (text[start..target_end].to_string(), target_end)
117    }
118
119    /// Find the nearest valid UTF-8 character boundary at or before the given position
120    fn find_char_boundary(text: &str, pos: usize) -> usize {
121        if pos >= text.len() {
122            return text.len();
123        }
124        // Walk backwards to find a char boundary
125        let mut boundary = pos;
126        while boundary > 0 && !text.is_char_boundary(boundary) {
127            boundary -= 1;
128        }
129        boundary
130    }
131
132    /// Find the next valid UTF-8 character boundary at or after the given position
133    fn find_next_char_boundary(text: &str, pos: usize) -> usize {
134        if pos >= text.len() {
135            return text.len();
136        }
137        // Walk forwards to find a char boundary
138        let mut boundary = pos;
139        while boundary < text.len() && !text.is_char_boundary(boundary) {
140            boundary += 1;
141        }
142        boundary
143    }
144
145    /// Get configuration as ChunkerConfig
146    pub fn config(&self) -> ChunkerConfig {
147        let sep_refs: Vec<&str> = self.separators.iter().map(|s| s.as_str()).collect();
148        ChunkerConfig::new(self.chunk_size, self.chunk_overlap, &sep_refs)
149    }
150}
151
152impl Default for SemanticChunker {
153    fn default() -> Self {
154        Self { chunk_size: 512, chunk_overlap: 64, separators: Self::default_separators() }
155    }
156}
157
/// A text chunk with position metadata
///
/// Offsets are byte offsets into the source string; lines are 1-indexed.
/// Because the chunker overlaps consecutive chunks, the spans of adjacent
/// chunks may overlap.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Chunk content
    pub content: String,
    /// Start byte offset in source
    pub start_offset: usize,
    /// End byte offset in source (exclusive)
    pub end_offset: usize,
    /// Start line number (1-indexed)
    pub start_line: usize,
    /// End line number (1-indexed)
    pub end_line: usize,
}
172
173impl Chunk {
174    /// Get content hash for deduplication
175    pub fn content_hash(&self) -> [u8; 32] {
176        // Use same hash function as fingerprint
177        let mut hash = [0u8; 32];
178        let mut state: u64 = 0xcbf2_9ce4_8422_2325;
179        for &byte in self.content.as_bytes() {
180            state ^= byte as u64;
181            state = state.wrapping_mul(0x0100_0000_01b3);
182        }
183        for i in 0..4 {
184            let chunk = state.wrapping_add(i as u64).to_le_bytes();
185            hash[i * 8..(i + 1) * 8].copy_from_slice(&chunk);
186        }
187        hash
188    }
189}
190
#[cfg(test)]
mod tests {
    use super::*;

    // Default construction exposes the documented 512/64 size/overlap pair
    // and a non-empty separator list.
    #[test]
    fn test_chunker_default() {
        let chunker = SemanticChunker::default();
        assert_eq!(chunker.chunk_size, 512);
        assert_eq!(chunker.chunk_overlap, 64);
        assert!(!chunker.separators.is_empty());
    }

    // Text shorter than chunk_size comes back as a single chunk.
    #[test]
    fn test_split_short_text() {
        let chunker = SemanticChunker::default();
        let text = "Short text";
        let chunks = chunker.split(text);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Short text");
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 1);
    }

    #[test]
    fn test_split_markdown_headers() {
        let chunker = SemanticChunker::new(100, 10, vec!["\n## ".to_string()]);
        let text = "# Title\n\nIntro paragraph.\n\n## Section 1\n\nContent 1.\n\n## Section 2\n\nContent 2.";

        let chunks = chunker.split(text);

        // Should split at ## headers
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_split_rust_code() {
        let chunker = SemanticChunker::new(200, 20, vec!["\nfn ".to_string(), "\n\n".to_string()]);
        let text = r#"
fn foo() {
    println!("foo");
}

fn bar() {
    println!("bar");
}

fn baz() {
    println!("baz");
}
"#;

        let chunks = chunker.split(text);

        // Should preserve function boundaries
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            // Each chunk should be valid
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_chunk_overlap() {
        let chunker = SemanticChunker::new(50, 10, vec![" ".to_string()]);
        let text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word12";

        let chunks = chunker.split(text);

        // With overlap, subsequent chunks should share some content
        if chunks.len() > 1 {
            // Check that chunks have reasonable sizes
            for chunk in &chunks {
                assert!(chunk.content.len() <= chunker.chunk_size + 20); // Some tolerance
            }
        }
    }

    #[test]
    fn test_chunk_line_tracking() {
        let chunker = SemanticChunker::new(50, 5, vec!["\n".to_string()]);
        let text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";

        let chunks = chunker.split(text);

        // First chunk should start at line 1
        assert_eq!(chunks[0].start_line, 1);
    }

    // The hash must depend only on content, not on position metadata.
    #[test]
    fn test_chunk_content_hash_deterministic() {
        let chunk1 = Chunk {
            content: "test content".to_string(),
            start_offset: 0,
            end_offset: 12,
            start_line: 1,
            end_line: 1,
        };
        let chunk2 = Chunk {
            content: "test content".to_string(),
            start_offset: 100, // Different offset, same content
            end_offset: 112,
            start_line: 5,
            end_line: 5,
        };

        assert_eq!(chunk1.content_hash(), chunk2.content_hash());
    }

    #[test]
    fn test_chunk_content_hash_different() {
        let chunk1 = Chunk {
            content: "content 1".to_string(),
            start_offset: 0,
            end_offset: 9,
            start_line: 1,
            end_line: 1,
        };
        let chunk2 = Chunk {
            content: "content 2".to_string(),
            start_offset: 0,
            end_offset: 9,
            start_line: 1,
            end_line: 1,
        };

        assert_ne!(chunk1.content_hash(), chunk2.content_hash());
    }

    #[test]
    fn test_empty_text() {
        let chunker = SemanticChunker::default();
        let chunks = chunker.split("");
        assert!(chunks.is_empty());
    }

    // Whitespace-only chunks are filtered out by split().
    #[test]
    fn test_whitespace_only() {
        let chunker = SemanticChunker::default();
        let chunks = chunker.split("   \n\n   \t   ");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_config_round_trip() {
        let chunker = SemanticChunker::new(256, 32, vec!["\n".to_string()]);
        let config = chunker.config();

        assert_eq!(config.chunk_size, 256);
        assert_eq!(config.chunk_overlap, 32);
    }

    #[test]
    fn test_large_document_chunking() {
        let chunker = SemanticChunker::new(100, 20, SemanticChunker::default_separators());

        // Create a large document
        let mut text = String::new();
        for i in 0..50 {
            text.push_str(&format!(
                "\n## Section {}\n\nThis is paragraph {} with some content.\n",
                i, i
            ));
        }

        let chunks = chunker.split(&text);

        // Should produce multiple chunks
        assert!(chunks.len() > 1);

        // All chunks should be non-empty
        for chunk in &chunks {
            assert!(!chunk.content.trim().is_empty());
        }
    }

    // Property-based tests for semantic chunker
    // (requires the external `proptest` dev-dependency)
    mod proptests {
        use super::*;
        use proptest::prelude::*;

        // Strategy for generating reasonable chunk sizes
        fn chunk_size_strategy() -> impl Strategy<Value = usize> {
            32usize..=1024
        }

        // Strategy for generating reasonable overlap (must be less than chunk size)
        fn overlap_strategy(chunk_size: usize) -> impl Strategy<Value = usize> {
            0..=(chunk_size / 2)
        }

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(50))]

            /// Property: Chunking any text produces chunks whose content equals the original
            #[test]
            fn prop_chunking_preserves_content(text in ".{0,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);

                // If there are chunks, they should contain content from the original
                for chunk in &chunks {
                    // Each chunk content should be found in original text (accounting for overlap)
                    prop_assert!(text.contains(chunk.content.trim()) || chunk.content.trim().is_empty());
                }
            }

            /// Property: All chunks have valid line numbers (start <= end)
            #[test]
            fn prop_chunk_lines_valid(text in ".{1,500}\n.{1,500}\n.{1,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);

                for chunk in &chunks {
                    prop_assert!(chunk.start_line <= chunk.end_line,
                        "start_line {} > end_line {}", chunk.start_line, chunk.end_line);
                    prop_assert!(chunk.start_line >= 1,
                        "start_line {} should be >= 1", chunk.start_line);
                }
            }

            /// Property: Chunk offsets are valid (start <= end, within bounds)
            #[test]
            fn prop_chunk_offsets_valid(text in ".{10,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);

                for chunk in &chunks {
                    prop_assert!(chunk.start_offset <= chunk.end_offset,
                        "start_offset {} > end_offset {}", chunk.start_offset, chunk.end_offset);
                    prop_assert!(chunk.end_offset <= text.len(),
                        "end_offset {} > text.len() {}", chunk.end_offset, text.len());
                }
            }

            /// Property: Content hashes are deterministic
            #[test]
            fn prop_content_hash_deterministic(content in ".{1,100}") {
                let chunk = Chunk {
                    content: content.clone(),
                    start_offset: 0,
                    end_offset: content.len(),
                    start_line: 1,
                    end_line: 1,
                };

                let hash1 = chunk.content_hash();
                let hash2 = chunk.content_hash();
                prop_assert_eq!(hash1, hash2);
            }

            /// Property: Different content produces different hashes (with high probability)
            #[test]
            fn prop_content_hash_different(
                content1 in "[a-z]{5,20}",
                content2 in "[A-Z]{5,20}"
            ) {
                // Ensure contents are different
                if content1 != content2 {
                    let chunk1 = Chunk {
                        content: content1.clone(),
                        start_offset: 0,
                        end_offset: content1.len(),
                        start_line: 1,
                        end_line: 1,
                    };
                    let chunk2 = Chunk {
                        content: content2.clone(),
                        start_offset: 0,
                        end_offset: content2.len(),
                        start_line: 1,
                        end_line: 1,
                    };

                    prop_assert_ne!(chunk1.content_hash(), chunk2.content_hash());
                }
            }

            /// Property: Custom chunk sizes are respected (approximately)
            #[test]
            fn prop_chunk_size_respected(
                chunk_size in chunk_size_strategy(),
                text_len in 100usize..2000
            ) {
                let overlap = chunk_size / 4;
                let chunker = SemanticChunker::new(chunk_size, overlap, vec![" ".to_string()]);

                // Generate text of specified length
                let text: String = (0..text_len).map(|i| if i % 10 == 0 { ' ' } else { 'a' }).collect();
                let chunks = chunker.split(&text);

                // Most chunks should be close to chunk_size (with some tolerance)
                for chunk in &chunks {
                    // Allow 2x tolerance for edge cases
                    prop_assert!(chunk.content.len() <= chunk_size * 2,
                        "chunk len {} > 2 * chunk_size {}", chunk.content.len(), chunk_size * 2);
                }
            }
        }
    }
}