codex_memory/
chunking.rs

1use crate::error::Result;
2use regex::Regex;
3
/// A content chunk with semantic boundaries
///
/// Byte offsets refer to positions in the content the chunker was given;
/// `chunk_index` is the chunk's ordinal within that content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContentChunk {
    /// The chunk text itself.
    pub content: String,
    /// Byte offset where the chunk begins.
    pub start_byte: usize,
    /// Byte offset just past the end of the chunk.
    pub end_byte: usize,
    /// Zero-based position of this chunk in the emitted sequence.
    pub chunk_index: usize,
}
12
/// Chunking strategy for semantic boundary preservation
//
// Fieldless enum, so `Copy`/`PartialEq`/`Eq` are free and make the type
// easier to pass around and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ChunkingStrategy {
    /// Split by sentences (preserves sentence boundaries)
    Sentence,
    /// Split by paragraphs (preserves paragraph boundaries)
    Paragraph,
    /// Semantic boundaries using NLP heuristics
    Semantic,
    /// Hybrid approach: size-based with semantic boundary adjustment
    #[default]
    Hybrid,
}
26
27impl std::str::FromStr for ChunkingStrategy {
28    type Err = ();
29
30    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
31        match s.to_lowercase().as_str() {
32            "sentence" => Ok(ChunkingStrategy::Sentence),
33            "paragraph" => Ok(ChunkingStrategy::Paragraph),
34            "semantic" => Ok(ChunkingStrategy::Semantic),
35            "hybrid" => Ok(ChunkingStrategy::Hybrid),
36            _ => Ok(ChunkingStrategy::Hybrid), // Default for unknown values
37        }
38    }
39}
40
41/// Advanced file chunker with semantic boundary preservation
42pub struct FileChunker {
43    chunk_size: usize,
44    overlap_size: usize,
45    strategy: ChunkingStrategy,
46}
47
48impl FileChunker {
49    /// Create a new file chunker with specified chunk and overlap sizes
50    pub fn new(chunk_size: usize, overlap_size: usize) -> Self {
51        Self {
52            chunk_size,
53            overlap_size,
54            strategy: ChunkingStrategy::default(),
55        }
56    }
57
58    /// Create a new file chunker with specified strategy
59    pub fn with_strategy(chunk_size: usize, overlap_size: usize, strategy: ChunkingStrategy) -> Self {
60        Self {
61            chunk_size,
62            overlap_size,
63            strategy,
64        }
65    }
66
67    /// Create a default chunker with 8KB chunks and 200 byte overlap
68    pub fn with_defaults() -> Self {
69        Self::new(8192, 200)
70    }
71
72    /// Chunk content into overlapping pieces using the configured strategy
73    pub fn chunk_content(&self, content: &str) -> Result<Vec<ContentChunk>> {
74        match self.strategy {
75            ChunkingStrategy::Sentence => self.chunk_by_sentences(content),
76            ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(content),
77            ChunkingStrategy::Semantic => self.chunk_semantic(content),
78            ChunkingStrategy::Hybrid => self.chunk_hybrid(content),
79        }
80    }
81
82    /// Chunk content by sentence boundaries
83    fn chunk_by_sentences(&self, content: &str) -> Result<Vec<ContentChunk>> {
84        let sentence_regex = Regex::new(r"[.!?]+\s+").map_err(|e| {
85            crate::error::Error::InternalError(format!("Failed to create sentence regex: {}", e))
86        })?;
87
88        let sentences: Vec<&str> = sentence_regex
89            .split(content)
90            .filter(|s| !s.trim().is_empty())
91            .collect();
92
93        self.group_sentences_into_chunks(&sentences, content)
94    }
95
96    /// Chunk content by paragraph boundaries
97    fn chunk_by_paragraphs(&self, content: &str) -> Result<Vec<ContentChunk>> {
98        let paragraphs: Vec<&str> = content
99            .split("\n\n")
100            .filter(|p| !p.trim().is_empty())
101            .collect();
102
103        self.group_paragraphs_into_chunks(&paragraphs, content)
104    }
105
106    /// Semantic chunking using NLP heuristics
107    fn chunk_semantic(&self, content: &str) -> Result<Vec<ContentChunk>> {
108        // Use a combination of semantic boundaries:
109        // 1. Code blocks (```...```)
110        // 2. Headers (# ## ###)
111        // 3. List items
112        // 4. Paragraph breaks
113        
114        let semantic_boundaries = self.find_semantic_boundaries(content)?;
115        self.create_chunks_from_boundaries(content, &semantic_boundaries)
116    }
117
118    /// Hybrid approach: size-based with semantic boundary adjustment
119    fn chunk_hybrid(&self, content: &str) -> Result<Vec<ContentChunk>> {
120        let mut chunks = Vec::new();
121        let content_bytes = content.as_bytes();
122        let mut start = 0;
123        let mut chunk_index = 0;
124
125        while start < content_bytes.len() {
126            let initial_end = (start + self.chunk_size).min(content_bytes.len());
127            
128            // Find the best semantic boundary near the target size
129            let semantic_end = self.find_best_semantic_boundary(
130                content, 
131                start, 
132                initial_end, 
133                content_bytes.len()
134            );
135
136            // Extract chunk content
137            let chunk_content = content[start..semantic_end].to_string();
138
139            chunks.push(ContentChunk {
140                content: chunk_content,
141                start_byte: start,
142                end_byte: semantic_end,
143                chunk_index,
144            });
145
146            // Move to next chunk with overlap, but respect word boundaries
147            if semantic_end >= content_bytes.len() {
148                break;
149            }
150
151            start = self.calculate_semantic_overlap_start(content, semantic_end);
152            chunk_index += 1;
153        }
154
155        Ok(chunks)
156    }
157
158    /// Group sentences into chunks respecting size limits
159    fn group_sentences_into_chunks(&self, sentences: &[&str], _original: &str) -> Result<Vec<ContentChunk>> {
160        let mut chunks = Vec::new();
161        let mut current_chunk = String::new();
162        let mut chunk_start = 0;
163        let mut chunk_index = 0;
164
165        for sentence in sentences {
166            let potential_chunk = if current_chunk.is_empty() {
167                sentence.to_string()
168            } else {
169                format!("{} {}", current_chunk, sentence)
170            };
171
172            if potential_chunk.len() <= self.chunk_size || current_chunk.is_empty() {
173                current_chunk = potential_chunk;
174            } else {
175                // Finalize current chunk
176                let chunk_end = chunk_start + current_chunk.len();
177                chunks.push(ContentChunk {
178                    content: current_chunk.trim().to_string(),
179                    start_byte: chunk_start,
180                    end_byte: chunk_end,
181                    chunk_index,
182                });
183
184                // Start new chunk with overlap
185                let overlap_content = self.calculate_sentence_overlap(&current_chunk);
186                current_chunk = if overlap_content.is_empty() {
187                    sentence.to_string()
188                } else {
189                    format!("{} {}", overlap_content, sentence)
190                };
191                
192                chunk_start = chunk_end - overlap_content.len();
193                chunk_index += 1;
194            }
195        }
196
197        // Add the last chunk if not empty
198        if !current_chunk.trim().is_empty() {
199            let chunk_end = chunk_start + current_chunk.len();
200            chunks.push(ContentChunk {
201                content: current_chunk.trim().to_string(),
202                start_byte: chunk_start,
203                end_byte: chunk_end,
204                chunk_index,
205            });
206        }
207
208        Ok(chunks)
209    }
210
211    /// Group paragraphs into chunks respecting size limits
212    fn group_paragraphs_into_chunks(&self, paragraphs: &[&str], _original: &str) -> Result<Vec<ContentChunk>> {
213        let mut chunks = Vec::new();
214        let mut current_chunk = String::new();
215        let mut chunk_start = 0;
216        let mut chunk_index = 0;
217
218        for paragraph in paragraphs {
219            let potential_chunk = if current_chunk.is_empty() {
220                paragraph.to_string()
221            } else {
222                format!("{}\n\n{}", current_chunk, paragraph)
223            };
224
225            if potential_chunk.len() <= self.chunk_size || current_chunk.is_empty() {
226                current_chunk = potential_chunk;
227            } else {
228                // Finalize current chunk
229                let chunk_end = chunk_start + current_chunk.len();
230                chunks.push(ContentChunk {
231                    content: current_chunk.trim().to_string(),
232                    start_byte: chunk_start,
233                    end_byte: chunk_end,
234                    chunk_index,
235                });
236
237                // Start new chunk (no overlap for paragraph chunking)
238                current_chunk = paragraph.to_string();
239                chunk_start = chunk_end;
240                chunk_index += 1;
241            }
242        }
243
244        // Add the last chunk if not empty
245        if !current_chunk.trim().is_empty() {
246            let chunk_end = chunk_start + current_chunk.len();
247            chunks.push(ContentChunk {
248                content: current_chunk.trim().to_string(),
249                start_byte: chunk_start,
250                end_byte: chunk_end,
251                chunk_index,
252            });
253        }
254
255        Ok(chunks)
256    }
257
258    /// Find semantic boundaries in the text
259    fn find_semantic_boundaries(&self, content: &str) -> Result<Vec<usize>> {
260        let mut boundaries = vec![0]; // Start with the beginning
261        
262        // Find code blocks
263        let code_block_regex = Regex::new(r"```[\s\S]*?```").map_err(|e| {
264            crate::error::Error::InternalError(format!("Failed to create code block regex: {}", e))
265        })?;
266        
267        for mat in code_block_regex.find_iter(content) {
268            boundaries.push(mat.start());
269            boundaries.push(mat.end());
270        }
271        
272        // Find headers
273        let header_regex = Regex::new(r"(?m)^#{1,6}\s").map_err(|e| {
274            crate::error::Error::InternalError(format!("Failed to create header regex: {}", e))
275        })?;
276        
277        for mat in header_regex.find_iter(content) {
278            boundaries.push(mat.start());
279        }
280        
281        // Find paragraph breaks
282        let paragraph_regex = Regex::new(r"\n\s*\n").map_err(|e| {
283            crate::error::Error::InternalError(format!("Failed to create paragraph regex: {}", e))
284        })?;
285        
286        for mat in paragraph_regex.find_iter(content) {
287            boundaries.push(mat.end());
288        }
289        
290        boundaries.push(content.len()); // End with the content length
291        boundaries.sort_unstable();
292        boundaries.dedup();
293        
294        Ok(boundaries)
295    }
296
297    /// Create chunks from semantic boundaries
298    fn create_chunks_from_boundaries(&self, content: &str, boundaries: &[usize]) -> Result<Vec<ContentChunk>> {
299        let mut chunks = Vec::new();
300        let mut chunk_index = 0;
301        
302        for window in boundaries.windows(2) {
303            let start = window[0];
304            let end = window[1];
305            let chunk_content = content[start..end].trim();
306            
307            if !chunk_content.is_empty() && chunk_content.len() >= 10 {
308                chunks.push(ContentChunk {
309                    content: chunk_content.to_string(),
310                    start_byte: start,
311                    end_byte: end,
312                    chunk_index,
313                });
314                chunk_index += 1;
315            }
316        }
317        
318        Ok(chunks)
319    }
320
321    /// Find the best semantic boundary near the target position
322    fn find_best_semantic_boundary(&self, content: &str, start: usize, target_end: usize, content_len: usize) -> usize {
323        if target_end >= content_len {
324            return content_len;
325        }
326        
327        // Search window around target_end
328        let search_start = (target_end.saturating_sub(200)).max(start);
329        let search_end = (target_end + 200).min(content_len);
330        
331        let search_text = &content[search_start..search_end];
332        
333        // Look for good boundaries in order of preference:
334        // 1. Double newlines (paragraph breaks)
335        // 2. Single newlines
336        // 3. Sentence endings
337        // 4. Word boundaries
338        
339        let relative_target = target_end - search_start;
340        
341        // Paragraph breaks
342        if let Some(pos) = self.find_nearest_match(search_text, r"\n\s*\n", relative_target) {
343            return search_start + pos;
344        }
345        
346        // Single newlines
347        if let Some(pos) = self.find_nearest_match(search_text, r"\n", relative_target) {
348            return search_start + pos;
349        }
350        
351        // Sentence endings
352        if let Some(pos) = self.find_nearest_match(search_text, r"[.!?]\s+", relative_target) {
353            return search_start + pos;
354        }
355        
356        // Word boundaries
357        if let Some(pos) = self.find_nearest_match(search_text, r"\s+", relative_target) {
358            return search_start + pos;
359        }
360        
361        // Fallback to original target
362        target_end
363    }
364
365    /// Find the nearest regex match to a target position
366    fn find_nearest_match(&self, text: &str, pattern: &str, target: usize) -> Option<usize> {
367        let regex = Regex::new(pattern).ok()?;
368        let mut closest_pos = None;
369        let mut closest_distance = usize::MAX;
370        
371        for mat in regex.find_iter(text) {
372            let distance = if mat.end() > target {
373                mat.end() - target
374            } else {
375                target - mat.end()
376            };
377            
378            if distance < closest_distance {
379                closest_distance = distance;
380                closest_pos = Some(mat.end());
381            }
382        }
383        
384        closest_pos
385    }
386
387    /// Calculate semantic overlap start position
388    fn calculate_semantic_overlap_start(&self, content: &str, end: usize) -> usize {
389        let overlap_target = end.saturating_sub(self.overlap_size);
390        
391        // Find word boundary for overlap
392        let search_start = overlap_target.saturating_sub(50);
393        let search_end = end.min(search_start + 100);
394        
395        if search_start >= search_end {
396            return overlap_target;
397        }
398        
399        let search_text = &content[search_start..search_end];
400        let relative_target = overlap_target - search_start;
401        
402        if let Some(pos) = self.find_nearest_match(search_text, r"\s+", relative_target) {
403            search_start + pos
404        } else {
405            overlap_target
406        }
407    }
408
409    /// Calculate overlap for sentence-based chunking
410    fn calculate_sentence_overlap(&self, chunk: &str) -> String {
411        let words: Vec<&str> = chunk.split_whitespace().collect();
412        let overlap_words = (words.len() * self.overlap_size / chunk.len()).min(words.len() / 4);
413        
414        if overlap_words > 0 {
415            words[words.len().saturating_sub(overlap_words)..]
416                .join(" ")
417        } else {
418            String::new()
419        }
420    }
421}
422
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain size-based input: 250 identical bytes must produce at least
    /// three chunks with sequential indices.
    #[test]
    #[allow(clippy::needless_borrow)]
    fn test_basic_chunking() {
        let chunker = FileChunker::new(100, 20);
        let content = "a".repeat(250);
        let chunks = chunker.chunk_content(&content).unwrap();

        assert!(chunks.len() >= 3);
        for (expected, chunk) in chunks.iter().take(2).enumerate() {
            assert_eq!(chunk.chunk_index, expected);
        }
    }

    /// Mixed ASCII/CJK input must never be split inside a multi-byte char.
    #[test]
    #[allow(clippy::needless_borrow)]
    fn test_utf8_boundary_safety() {
        let chunker = FileChunker::new(10, 2);
        let content = "Hello 世界 World";
        let chunks = chunker.chunk_content(&content).unwrap();

        for chunk in chunks {
            let well_formed = chunk.content.is_ascii()
                || chunk
                    .content
                    .chars()
                    .all(|c| c.is_alphabetic() || c.is_whitespace());
            assert!(well_formed);
        }
    }

    /// Consecutive hybrid chunks must overlap when more than one is emitted.
    #[test]
    #[allow(clippy::needless_borrow)]
    fn test_overlap() {
        let chunker = FileChunker::new(50, 10);
        let content = "a".repeat(100);
        let chunks = chunker.chunk_content(&content).unwrap();

        if let [first, second, ..] = chunks.as_slice() {
            assert!(second.start_byte < first.end_byte);
        }
    }
}