Skip to main content

oxios_kernel/memory/
chunking.rs

//! Text chunking utilities for memory content processing.
//!
//! Splits text into chunks for embedding generation. Two strategies are
//! supported: fixed-size sliding-window chunking, where consecutive chunks
//! overlap, and paragraph-based chunking, where chunks do not overlap.
5
/// Configuration for text chunking.
///
/// `chunk_fixed` uses all three fields. `chunk_paragraphs` only consults
/// `max_chunk_size`; it neither overlaps chunks nor merges short ones.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Maximum characters per chunk.
    pub max_chunk_size: usize,
    /// Overlap characters between consecutive chunks.
    ///
    /// When this is greater than or equal to `max_chunk_size`, `chunk_fixed`
    /// still makes progress by advancing one character at a time.
    pub overlap: usize,
    /// Minimum characters for a chunk (shorter segments are merged).
    ///
    /// In `chunk_fixed`, a trailing segment shorter than this is folded into
    /// the preceding chunk, which may therefore exceed `max_chunk_size`.
    pub min_chunk_size: usize,
}
16
17impl Default for ChunkConfig {
18    fn default() -> Self {
19        Self {
20            max_chunk_size: 512,
21            overlap: 64,
22            min_chunk_size: 50,
23        }
24    }
25}
26
/// A text chunk with metadata.
///
/// Produced by the chunking functions in this module; `start..end` is a
/// half-open range locating the chunk in the text that was chunked.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk {
    /// The chunk text content.
    pub text: String,
    /// Start char offset in the original text (inclusive).
    pub start: usize,
    /// End char offset in the original text (exclusive).
    pub end: usize,
    /// Chunk index (0-based).
    pub index: usize,
}
39
40/// Split text into fixed-size overlapping chunks.
41///
42/// Chunks are created by sliding a window of `max_chunk_size` characters
43/// with `overlap` characters of overlap between consecutive chunks.
44pub fn chunk_fixed(text: &str, config: &ChunkConfig) -> Vec<TextChunk> {
45    if text.is_empty() {
46        return Vec::new();
47    }
48
49    let chars: Vec<char> = text.chars().collect();
50    let len = chars.len();
51
52    if len <= config.max_chunk_size {
53        return vec![TextChunk {
54            text: text.to_string(),
55            start: 0,
56            end: len,
57            index: 0,
58        }];
59    }
60
61    let mut chunks = Vec::new();
62    let step = config.max_chunk_size.saturating_sub(config.overlap);
63    let step = step.max(1); // ensure progress
64    let mut pos = 0;
65    let mut idx = 0;
66
67    while pos < len {
68        let end = (pos + config.max_chunk_size).min(len);
69        let chunk_text: String = chars[pos..end].iter().collect();
70
71        chunks.push(TextChunk {
72            text: chunk_text,
73            start: pos,
74            end,
75            index: idx,
76        });
77
78        pos += step;
79        idx += 1;
80
81        // If remaining is too small, include in last chunk
82        if pos < len && len - pos < config.min_chunk_size {
83            if let Some(last) = chunks.last_mut() {
84                let remaining: String = chars[pos..].iter().collect();
85                last.text.push_str(&remaining);
86                last.end = len;
87            }
88            break;
89        }
90    }
91
92    chunks
93}
94
95/// Split text into paragraphs, then group paragraphs into chunks
96/// that don't exceed `max_chunk_size`.
97pub fn chunk_paragraphs(text: &str, config: &ChunkConfig) -> Vec<TextChunk> {
98    if text.is_empty() {
99        return Vec::new();
100    }
101
102    // Split on double newlines (paragraph boundaries)
103    let paragraphs: Vec<&str> = text
104        .split("\n\n")
105        .map(|p| p.trim())
106        .filter(|p| !p.is_empty())
107        .collect();
108
109    if paragraphs.is_empty() {
110        return Vec::new();
111    }
112
113    let mut chunks = Vec::new();
114    let mut current_text = String::new();
115    let mut chunk_start = 0;
116    let mut idx = 0;
117
118    for para in &paragraphs {
119        if !current_text.is_empty() {
120            current_text.push_str("\n\n");
121        }
122
123        // If adding this paragraph exceeds max size, flush current chunk
124        if !current_text.is_empty() && current_text.len() + para.len() > config.max_chunk_size {
125            let end = chunk_start + current_text.len();
126            chunks.push(TextChunk {
127                text: current_text.clone(),
128                start: chunk_start,
129                end,
130                index: idx,
131            });
132            idx += 1;
133            chunk_start = end;
134            current_text.clear();
135        }
136
137        current_text.push_str(para);
138    }
139
140    // Flush remaining
141    if !current_text.is_empty() {
142        let len = current_text.len();
143        chunks.push(TextChunk {
144            text: current_text,
145            start: chunk_start,
146            end: chunk_start + len,
147            index: idx,
148        });
149    }
150
151    chunks
152}
153
154// ---------------------------------------------------------------------------
155// Tests
156// ---------------------------------------------------------------------------
157
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an explicit configuration in tests.
    fn config(max_chunk_size: usize, overlap: usize, min_chunk_size: usize) -> ChunkConfig {
        ChunkConfig {
            max_chunk_size,
            overlap,
            min_chunk_size,
        }
    }

    #[test]
    fn test_chunk_fixed_empty() {
        // Empty input must not produce any chunk.
        assert!(chunk_fixed("", &ChunkConfig::default()).is_empty());
    }

    #[test]
    fn test_chunk_fixed_short_text() {
        // Text shorter than the window comes back as one untouched chunk.
        let chunks = chunk_fixed("hello world", &ChunkConfig::default());
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "hello world");
    }

    #[test]
    fn test_chunk_fixed_long_text() {
        let text = "abcdefghij".repeat(100); // 1000 chars
        let chunks = chunk_fixed(&text, &config(200, 20, 50));

        assert!(chunks.len() > 1);
        // Allow some slack for min_chunk merging.
        assert!(chunks.iter().all(|chunk| chunk.text.len() <= 250));

        // The last 20 chars of the first chunk must open the second chunk.
        if let [first, second, ..] = chunks.as_slice() {
            let first_chars: Vec<char> = first.text.chars().collect();
            let tail_from = first_chars.len().saturating_sub(20);
            let suffix: String = first_chars[tail_from..].iter().collect();
            let prefix: String = second.text.chars().take(20).collect();
            assert_eq!(suffix, prefix, "Overlapping region should match");
        }
    }

    #[test]
    fn test_chunk_paragraphs_basic() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = chunk_paragraphs(text, &config(100, 0, 10));

        // Everything fits in a single chunk.
        assert_eq!(chunks.len(), 1);
        let only = &chunks[0];
        assert!(only.text.contains("First"));
        assert!(only.text.contains("Third"));
    }

    #[test]
    fn test_chunk_paragraphs_split() {
        // Three 50-char paragraphs cannot share an 80-char chunk.
        let text = ["a", "b", "c"]
            .iter()
            .map(|letter| letter.repeat(50))
            .collect::<Vec<_>>()
            .join("\n\n");

        let chunks = chunk_paragraphs(&text, &config(80, 0, 10));
        assert!(chunks.len() >= 2, "Should split into multiple chunks");
    }

    #[test]
    fn test_chunk_fixed_indices() {
        let chunks = chunk_fixed("abcdefghij", &config(5, 2, 1));

        // Start offsets begin at zero and never move backwards.
        assert_eq!(chunks[0].start, 0);
        for pair in chunks.windows(2) {
            assert!(pair[1].start >= pair[0].start);
        }
    }

    #[test]
    fn test_chunk_default_config() {
        let ChunkConfig {
            max_chunk_size,
            overlap,
            min_chunk_size,
        } = ChunkConfig::default();
        assert_eq!(max_chunk_size, 512);
        assert_eq!(overlap, 64);
        assert_eq!(min_chunk_size, 50);
    }
}