codex_memory/
chunking.rs

1use crate::error::Result;
2
/// One piece of a larger text, together with the byte range it was cut from.
///
/// `start_byte..end_byte` is a half-open range into the original content, and
/// `content` holds a copy of the text in that range.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContentChunk {
    /// The text of this chunk.
    pub content: String,
    /// Byte offset in the original content at which this chunk begins (inclusive).
    pub start_byte: usize,
    /// Byte offset in the original content at which this chunk ends (exclusive).
    pub end_byte: usize,
    /// Zero-based position of this chunk in the produced sequence.
    pub chunk_index: usize,
}
11
/// Splits text into byte-bounded, optionally overlapping chunks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileChunker {
    /// Upper bound, in bytes, on the size of a chunk.
    chunk_size: usize,
    /// Number of trailing bytes of a chunk that the following chunk repeats.
    overlap_size: usize,
}
17
18impl FileChunker {
19    /// Create a new file chunker with specified chunk and overlap sizes
20    pub fn new(chunk_size: usize, overlap_size: usize) -> Self {
21        Self {
22            chunk_size,
23            overlap_size,
24        }
25    }
26
27    /// Create a default chunker with 8KB chunks and 200 byte overlap
28    pub fn with_defaults() -> Self {
29        Self::new(8192, 200)
30    }
31
32    /// Chunk content into overlapping pieces
33    pub fn chunk_content(&self, content: &str) -> Result<Vec<ContentChunk>> {
34        let mut chunks = Vec::new();
35        let bytes = content.as_bytes();
36        let mut start = 0;
37        let mut chunk_index = 0;
38
39        while start < bytes.len() {
40            // Calculate end position
41            let end = (start + self.chunk_size).min(bytes.len());
42
43            // Find a safe UTF-8 boundary
44            let safe_end = if end < bytes.len() {
45                // Look for the last valid UTF-8 character boundary
46                let mut boundary = end;
47                while boundary > start && !content.is_char_boundary(boundary) {
48                    boundary -= 1;
49                }
50                boundary
51            } else {
52                end
53            };
54
55            // Extract chunk content
56            let chunk_content = String::from_utf8_lossy(&bytes[start..safe_end]).to_string();
57
58            chunks.push(ContentChunk {
59                content: chunk_content,
60                start_byte: start,
61                end_byte: safe_end,
62                chunk_index,
63            });
64
65            // Move to next chunk with overlap
66            if safe_end >= bytes.len() {
67                break;
68            }
69
70            // Calculate overlap start
71            let overlap_start = if safe_end > self.overlap_size {
72                safe_end - self.overlap_size
73            } else {
74                safe_end
75            };
76
77            // Find safe UTF-8 boundary for overlap
78            let mut safe_overlap_start = overlap_start;
79            while safe_overlap_start < safe_end && !content.is_char_boundary(safe_overlap_start) {
80                safe_overlap_start += 1;
81            }
82
83            start = safe_overlap_start;
84            chunk_index += 1;
85        }
86
87        Ok(chunks)
88    }
89}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// Content longer than two chunks is split into sequentially indexed chunks
    /// that begin at byte 0 and end at the last byte of the content.
    #[test]
    fn test_basic_chunking() {
        let chunker = FileChunker::new(100, 20);
        let content = "a".repeat(250);
        let chunks = chunker.chunk_content(&content).unwrap();

        assert!(chunks.len() >= 3);
        for (position, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, position);
        }
        assert_eq!(chunks[0].start_byte, 0);
        let last = chunks.last().expect("at least three chunks were asserted above");
        assert_eq!(last.end_byte, content.len());
    }

    /// Chunk edges never fall inside a multi-byte character, and each chunk's
    /// text is exactly the slice of the input that its byte range names.
    #[test]
    fn test_utf8_boundary_safety() {
        let chunker = FileChunker::new(10, 2);
        let content = "Hello 世界 World";
        let chunks = chunker.chunk_content(content).unwrap();

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(content.is_char_boundary(chunk.start_byte));
            assert!(content.is_char_boundary(chunk.end_byte));
            assert_eq!(chunk.content, &content[chunk.start_byte..chunk.end_byte]);
        }
    }

    /// The second chunk starts before the first one ends, sharing exactly the
    /// configured number of overlap bytes for single-byte content.
    #[test]
    fn test_overlap() {
        let chunker = FileChunker::new(50, 10);
        let content = "a".repeat(100);
        let chunks = chunker.chunk_content(&content).unwrap();

        // Fail loudly instead of silently skipping the overlap check.
        assert!(chunks.len() > 1, "expected more than one chunk");

        let overlap_start = chunks[1].start_byte;
        let first_end = chunks[0].end_byte;
        assert!(overlap_start < first_end);
        assert_eq!(first_end - overlap_start, 10);
    }
}