codex_memory/
chunking.rs

1use crate::error::Result;
2
/// A contiguous piece of a source text together with its position in that text.
///
/// `start_byte..end_byte` is the half-open byte range of the source that
/// `content` was taken from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContentChunk {
    /// Text of this chunk.
    pub content: String,
    /// Byte offset in the source where this chunk begins (inclusive).
    pub start_byte: usize,
    /// Byte offset in the source where this chunk ends (exclusive).
    pub end_byte: usize,
    /// Zero-based position of this chunk in the sequence of chunks.
    pub chunk_index: usize,
}
11
/// Splits text into consecutive, optionally overlapping chunks.
///
/// Both sizes are measured in bytes of the UTF-8 encoded text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileChunker {
    /// Target maximum size of a single chunk, in bytes.
    chunk_size: usize,
    /// How many bytes the next chunk reaches back into the previous one.
    overlap_size: usize,
}
17
18impl FileChunker {
19    /// Create a new file chunker with specified chunk and overlap sizes
20    pub fn new(chunk_size: usize, overlap_size: usize) -> Self {
21        Self {
22            chunk_size,
23            overlap_size,
24        }
25    }
26
27    /// Create a default chunker with 8KB chunks and 200 byte overlap
28    pub fn with_defaults() -> Self {
29        Self::new(8192, 200)
30    }
31
32    /// Chunk content into overlapping pieces
33    pub fn chunk_content(&self, content: &str) -> Result<Vec<ContentChunk>> {
34        let mut chunks = Vec::new();
35        let bytes = content.as_bytes();
36        let mut start = 0;
37        let mut chunk_index = 0;
38
39        while start < bytes.len() {
40            // Calculate end position
41            let end = (start + self.chunk_size).min(bytes.len());
42
43            // Find a safe UTF-8 boundary
44            let safe_end = if end < bytes.len() {
45                // Look for the last valid UTF-8 character boundary
46                let mut boundary = end;
47                while boundary > start && !content.is_char_boundary(boundary) {
48                    boundary -= 1;
49                }
50                boundary
51            } else {
52                end
53            };
54
55            // Extract chunk content
56            let chunk_content = String::from_utf8_lossy(&bytes[start..safe_end]).to_string();
57
58            chunks.push(ContentChunk {
59                content: chunk_content,
60                start_byte: start,
61                end_byte: safe_end,
62                chunk_index,
63            });
64
65            // Move to next chunk with overlap
66            if safe_end >= bytes.len() {
67                break;
68            }
69
70            // Calculate overlap start
71            let overlap_start = if safe_end > self.overlap_size {
72                safe_end - self.overlap_size
73            } else {
74                safe_end
75            };
76
77            // Find safe UTF-8 boundary for overlap
78            let mut safe_overlap_start = overlap_start;
79            while safe_overlap_start < safe_end && !content.is_char_boundary(safe_overlap_start) {
80                safe_overlap_start += 1;
81            }
82
83            start = safe_overlap_start;
84            chunk_index += 1;
85        }
86
87        Ok(chunks)
88    }
89}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// Collect the `(start_byte, end_byte)` pair of every chunk.
    fn bounds(chunks: &[ContentChunk]) -> Vec<(usize, usize)> {
        chunks.iter().map(|c| (c.start_byte, c.end_byte)).collect()
    }

    /// 250 bytes with 100-byte chunks and a 20-byte overlap give three chunks
    /// with consecutive indices whose text matches the recorded byte range.
    #[test]
    fn test_basic_chunking() {
        let chunker = FileChunker::new(100, 20);
        let content = "a".repeat(250);
        let chunks = chunker.chunk_content(&content).unwrap();

        assert_eq!(bounds(&chunks), vec![(0, 100), (80, 180), (160, 250)]);
        for (expected_index, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, expected_index);
            assert_eq!(
                chunk.content.as_str(),
                &content[chunk.start_byte..chunk.end_byte]
            );
        }
    }

    /// Chunk limits must never fall inside a multi-byte character.
    #[test]
    fn test_utf8_boundary_safety() {
        let chunker = FileChunker::new(10, 2);
        let content = "Hello 世界 World";
        let chunks = chunker.chunk_content(content).unwrap();

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(content.is_char_boundary(chunk.start_byte));
            assert!(content.is_char_boundary(chunk.end_byte));
            // A split character would surface as U+FFFD or as a mismatch with
            // the source slice.
            assert!(!chunk.content.contains('\u{FFFD}'));
            assert_eq!(
                chunk.content.as_str(),
                &content[chunk.start_byte..chunk.end_byte]
            );
        }
        assert_eq!(chunks.last().unwrap().end_byte, content.len());
    }

    /// Consecutive chunks share `overlap_size` bytes.
    #[test]
    fn test_overlap() {
        let chunker = FileChunker::new(50, 10);
        let content = "a".repeat(100);
        let chunks = chunker.chunk_content(&content).unwrap();

        // Assert unconditionally: a single-chunk result must fail the test
        // rather than skip the overlap check.
        assert!(chunks.len() > 1);
        for pair in chunks.windows(2) {
            assert!(pair[1].start_byte < pair[0].end_byte);
            assert_eq!(pair[0].end_byte - pair[1].start_byte, 10);
        }
    }
}