// oxios_kernel/memory/chunking.rs

/// Tuning parameters shared by the chunking functions in this module.
///
/// The unit in which the sizes are counted is defined by the chunking
/// function that consumes the configuration.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Upper bound on the size of a regular chunk.
    pub max_chunk_size: usize,
    /// Amount of text shared between two consecutive fixed-size chunks.
    pub overlap: usize,
    /// A trailing remainder smaller than this is folded into the preceding
    /// chunk instead of being emitted on its own.
    pub min_chunk_size: usize,
}
16
17impl Default for ChunkConfig {
18 fn default() -> Self {
19 Self {
20 max_chunk_size: 512,
21 overlap: 64,
22 min_chunk_size: 50,
23 }
24 }
25}
26
/// One piece of a larger text, as produced by a chunking function.
///
/// The unit of `start` and `end` is defined by the function that produced the
/// chunk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk {
    /// The chunk's content.
    pub text: String,
    /// Offset at which the chunk begins (inclusive).
    pub start: usize,
    /// Offset at which the chunk ends (exclusive).
    pub end: usize,
    /// Zero-based position of the chunk within the returned sequence.
    pub index: usize,
}
39
40pub fn chunk_fixed(text: &str, config: &ChunkConfig) -> Vec<TextChunk> {
45 if text.is_empty() {
46 return Vec::new();
47 }
48
49 let chars: Vec<char> = text.chars().collect();
50 let len = chars.len();
51
52 if len <= config.max_chunk_size {
53 return vec![TextChunk {
54 text: text.to_string(),
55 start: 0,
56 end: len,
57 index: 0,
58 }];
59 }
60
61 let mut chunks = Vec::new();
62 let step = config.max_chunk_size.saturating_sub(config.overlap);
63 let step = step.max(1); let mut pos = 0;
65 let mut idx = 0;
66
67 while pos < len {
68 let end = (pos + config.max_chunk_size).min(len);
69 let chunk_text: String = chars[pos..end].iter().collect();
70
71 chunks.push(TextChunk {
72 text: chunk_text,
73 start: pos,
74 end,
75 index: idx,
76 });
77
78 pos += step;
79 idx += 1;
80
81 if pos < len && len - pos < config.min_chunk_size {
83 if let Some(last) = chunks.last_mut() {
84 let remaining: String = chars[pos..].iter().collect();
85 last.text.push_str(&remaining);
86 last.end = len;
87 }
88 break;
89 }
90 }
91
92 chunks
93}
94
95pub fn chunk_paragraphs(text: &str, config: &ChunkConfig) -> Vec<TextChunk> {
98 if text.is_empty() {
99 return Vec::new();
100 }
101
102 let paragraphs: Vec<&str> = text
104 .split("\n\n")
105 .map(|p| p.trim())
106 .filter(|p| !p.is_empty())
107 .collect();
108
109 if paragraphs.is_empty() {
110 return Vec::new();
111 }
112
113 let mut chunks = Vec::new();
114 let mut current_text = String::new();
115 let mut chunk_start = 0;
116 let mut idx = 0;
117
118 for para in ¶graphs {
119 if !current_text.is_empty() {
120 current_text.push_str("\n\n");
121 }
122
123 if !current_text.is_empty() && current_text.len() + para.len() > config.max_chunk_size {
125 let end = chunk_start + current_text.len();
126 chunks.push(TextChunk {
127 text: current_text.clone(),
128 start: chunk_start,
129 end,
130 index: idx,
131 });
132 idx += 1;
133 chunk_start = end;
134 current_text.clear();
135 }
136
137 current_text.push_str(para);
138 }
139
140 if !current_text.is_empty() {
142 let len = current_text.len();
143 chunks.push(TextChunk {
144 text: current_text,
145 start: chunk_start,
146 end: chunk_start + len,
147 index: idx,
148 });
149 }
150
151 chunks
152}
153
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input produces no chunks at all.
    #[test]
    fn test_chunk_fixed_empty() {
        let config = ChunkConfig::default();
        let chunks = chunk_fixed("", &config);
        assert!(chunks.is_empty());
    }

    /// Text shorter than `max_chunk_size` comes back as a single chunk.
    #[test]
    fn test_chunk_fixed_short_text() {
        let config = ChunkConfig::default();
        let chunks = chunk_fixed("hello world", &config);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "hello world");
    }

    /// Long text is split, chunk sizes stay bounded, and neighbours overlap.
    #[test]
    fn test_chunk_fixed_long_text() {
        // 1000 characters.
        let text = "abcdefghij".repeat(100);
        let config = ChunkConfig {
            max_chunk_size: 200,
            overlap: 20,
            min_chunk_size: 50,
        };
        let chunks = chunk_fixed(&text, &config);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            // A chunk that absorbed a short tail may exceed `max_chunk_size`,
            // but by less than `min_chunk_size`.
            assert!(chunk.text.len() <= 250);
        }

        // The last 20 characters of the first chunk must open the second one.
        let first: Vec<char> = chunks[0].text.chars().collect();
        let suffix: String = first[first.len() - 20..].iter().collect();
        let prefix: String = chunks[1].text.chars().take(20).collect();
        assert_eq!(suffix, prefix, "Overlapping region should match");
    }

    /// Paragraphs that fit together are merged into one chunk.
    #[test]
    fn test_chunk_paragraphs_basic() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let config = ChunkConfig {
            max_chunk_size: 100,
            overlap: 0,
            min_chunk_size: 10,
        };
        let chunks = chunk_paragraphs(text, &config);
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].text.contains("First"));
        assert!(chunks[0].text.contains("Third"));
    }

    /// Paragraphs that do not fit together end up in separate chunks.
    #[test]
    fn test_chunk_paragraphs_split() {
        let para1 = "a".repeat(50);
        let para2 = "b".repeat(50);
        let para3 = "c".repeat(50);
        let text = format!("{}\n\n{}\n\n{}", para1, para2, para3);

        let config = ChunkConfig {
            max_chunk_size: 80,
            overlap: 0,
            min_chunk_size: 10,
        };
        let chunks = chunk_paragraphs(&text, &config);
        assert!(chunks.len() >= 2, "Should split into multiple chunks");
    }

    /// Chunks start at offset 0 and their start offsets never go backwards.
    #[test]
    fn test_chunk_fixed_indices() {
        let text = "abcdefghij";
        let config = ChunkConfig {
            max_chunk_size: 5,
            overlap: 2,
            min_chunk_size: 1,
        };
        let chunks = chunk_fixed(text, &config);

        assert_eq!(chunks[0].start, 0);
        for pair in chunks.windows(2) {
            assert!(pair[1].start >= pair[0].start);
        }
    }

    /// The default configuration keeps its documented values.
    #[test]
    fn test_chunk_default_config() {
        let config = ChunkConfig::default();
        assert_eq!(config.max_chunk_size, 512);
        assert_eq!(config.overlap, 64);
        assert_eq!(config.min_chunk_size, 50);
    }
}