/// Strategy for splitting a piece of text into smaller chunks.
///
/// The `Send + Sync + 'static` supertraits require every implementor to be
/// safe to share between threads and to hold no non-`'static` borrows.
pub trait Chunker: Send + Sync + 'static {
    /// Splits `text` into chunks, returned as owned strings.
    fn chunk(&self, text: &str) -> Vec<String>;
}
29
/// Chunker configuration for fixed-length windows with optional overlap.
///
/// Both values are measured in characters (`char`s), not bytes.
pub struct FixedSizeChunker {
    /// Maximum number of characters per chunk.
    chunk_size: usize,
    /// Number of characters shared between the end of one chunk and the
    /// start of the next.
    overlap: usize,
}

impl FixedSizeChunker {
    /// Creates a chunker producing windows of `chunk_size` characters where
    /// consecutive windows share `overlap` characters.
    ///
    /// # Panics
    ///
    /// Panics if `overlap` is not strictly smaller than `chunk_size`
    /// (which also rules out a `chunk_size` of zero).
    #[must_use]
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        // The window must advance by at least one character per step.
        assert!(
            chunk_size > overlap,
            "overlap ({overlap}) must be less than chunk_size ({chunk_size})"
        );
        Self { chunk_size, overlap }
    }
}
69
70impl Chunker for FixedSizeChunker {
71 fn chunk(&self, text: &str) -> Vec<String> {
72 if text.is_empty() {
73 return Vec::new();
74 }
75
76 let chars: Vec<char> = text.chars().collect();
77 let total = chars.len();
78
79 if total <= self.chunk_size {
80 return vec![text.to_string()];
81 }
82
83 let step = self.chunk_size - self.overlap;
84 let mut chunks = Vec::new();
85 let mut start = 0;
86
87 while start < total {
88 let end = (start + self.chunk_size).min(total);
89 let chunk: String = chars[start..end].iter().collect();
90 chunks.push(chunk);
91 if end == total {
92 break;
93 }
94 start += step;
95 }
96
97 chunks
98 }
99}
100
/// Chunker configuration for grouping a fixed number of sentences per chunk.
pub struct SentenceChunker {
    /// Number of sentences joined into one chunk.
    sentences_per_chunk: usize,
}

impl SentenceChunker {
    /// Creates a chunker that groups `sentences_per_chunk` sentences together.
    ///
    /// # Panics
    ///
    /// Panics if `sentences_per_chunk` is zero.
    #[must_use]
    pub fn new(sentences_per_chunk: usize) -> Self {
        assert!(sentences_per_chunk > 0, "sentences_per_chunk must be > 0");
        Self {
            sentences_per_chunk,
        }
    }

    /// Returns `true` for the characters that end a sentence.
    fn is_terminator(ch: char) -> bool {
        matches!(ch, '.' | '!' | '?')
    }

    /// Splits `text` into trimmed, non-empty sentences.
    ///
    /// A sentence ends at `.`, `!` or `?`. A run of consecutive terminators
    /// (`...`, `?!`) stays attached to the sentence it closes instead of
    /// producing punctuation-only fragments. Text after the final terminator
    /// is returned as a last sentence.
    ///
    /// The split is purely character-based: a `.` inside a token such as
    /// `3.14` still ends a sentence.
    fn split_sentences(text: &str) -> Vec<String> {
        let mut sentences = Vec::new();
        let mut current = String::new();
        let mut chars = text.chars().peekable();

        while let Some(ch) = chars.next() {
            current.push(ch);

            // Only the last terminator of a run closes the sentence.
            let run_continues = matches!(chars.peek(), Some(&next) if Self::is_terminator(next));
            if Self::is_terminator(ch) && !run_continues {
                let sentence = current.trim();
                if !sentence.is_empty() {
                    sentences.push(sentence.to_string());
                }
                current.clear();
            }
        }

        // Trailing text without a terminator still counts as a sentence.
        let remainder = current.trim();
        if !remainder.is_empty() {
            sentences.push(remainder.to_string());
        }

        sentences
    }
}
162
163impl Chunker for SentenceChunker {
164 fn chunk(&self, text: &str) -> Vec<String> {
165 if text.is_empty() {
166 return Vec::new();
167 }
168
169 let sentences = Self::split_sentences(text);
170 if sentences.is_empty() {
171 return Vec::new();
172 }
173
174 sentences
175 .chunks(self.sentences_per_chunk)
176 .map(|window| window.join(" "))
177 .collect()
178 }
179}
180
181pub struct RecursiveChunker {
202 max_chunk_size: usize,
203}
204
205impl RecursiveChunker {
206 #[must_use]
212 pub fn new(max_chunk_size: usize) -> Self {
213 assert!(max_chunk_size > 0, "max_chunk_size must be > 0");
214 Self { max_chunk_size }
215 }
216
217 fn split_by_level(text: &str, max: usize) -> Vec<String> {
218 let mut result = Vec::new();
219
220 for para in text.split("\n\n") {
222 let para = para.trim();
223 if para.is_empty() {
224 continue;
225 }
226
227 if para.chars().count() <= max {
228 result.push(para.to_string());
229 } else {
230 let mut sentence_buf = String::new();
232 for sentence in SentenceChunker::split_sentences(para) {
233 if sentence_buf.chars().count() + sentence.chars().count() + 1 <= max {
234 if !sentence_buf.is_empty() {
235 sentence_buf.push(' ');
236 }
237 sentence_buf.push_str(&sentence);
238 } else {
239 if !sentence_buf.is_empty() {
240 if sentence_buf.chars().count() <= max {
242 result.push(sentence_buf.clone());
243 } else {
244 let fixed = FixedSizeChunker::new(max, 0);
246 result.extend(fixed.chunk(&sentence_buf));
247 }
248 sentence_buf.clear();
249 }
250 sentence_buf = sentence;
252 }
253 }
254 if !sentence_buf.is_empty() {
255 if sentence_buf.chars().count() <= max {
256 result.push(sentence_buf);
257 } else {
258 let fixed = FixedSizeChunker::new(max, 0);
259 result.extend(fixed.chunk(&sentence_buf));
260 }
261 }
262 }
263 }
264
265 result
266 }
267}
268
269impl Chunker for RecursiveChunker {
270 fn chunk(&self, text: &str) -> Vec<String> {
271 if text.is_empty() {
272 return Vec::new();
273 }
274 Self::split_by_level(text, self.max_chunk_size)
275 }
276}
277
// Unit tests for the fixed-size, sentence and recursive chunkers.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- FixedSizeChunker ----

    #[test]
    fn test_fixed_size_empty_input() {
        let c = FixedSizeChunker::new(200, 50);
        assert!(c.chunk("").is_empty());
    }

    #[test]
    fn test_fixed_size_short_text() {
        let c = FixedSizeChunker::new(200, 50);
        let chunks = c.chunk("short");
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], "short");
    }

    #[test]
    fn test_fixed_size_produces_overlap() {
        // Non-uniform text: with identical characters every 50-char slice
        // compares equal, so a broken overlap could never be detected.
        let text: String = ('a'..='z').cycle().take(1000).collect();
        let c = FixedSizeChunker::new(200, 50);
        let chunks = c.chunk(&text);
        assert!(
            chunks.len() >= 5,
            "expected >= 5 chunks, got {}",
            chunks.len()
        );

        // The text is ASCII, so byte offsets equal character offsets.
        for pair in chunks.windows(2) {
            let end_of_first = &pair[0][pair[0].len() - 50..];
            let start_of_second = &pair[1][..50];
            assert_eq!(end_of_first, start_of_second, "overlap not maintained");
        }
    }

    #[test]
    fn test_fixed_size_each_chunk_not_exceeds_size() {
        let text = "x".repeat(500);
        let c = FixedSizeChunker::new(100, 25);
        for chunk in c.chunk(&text) {
            assert!(chunk.chars().count() <= 100);
        }
    }

    // ---- SentenceChunker ----

    #[test]
    fn test_sentence_chunker_empty() {
        let c = SentenceChunker::new(3);
        assert!(c.chunk("").is_empty());
    }

    #[test]
    fn test_sentence_chunker_10_sentences_gives_4_chunks() {
        let sents: Vec<String> = (1..=10).map(|i| format!("Sentence {i}.")).collect();
        let text = sents.join(" ");
        let c = SentenceChunker::new(3);
        let chunks = c.chunk(&text);
        // 10 sentences in groups of 3 -> 3 full groups + 1 partial group.
        assert_eq!(
            chunks.len(),
            4,
            "expected 4 chunks, got {}: {:?}",
            chunks.len(),
            chunks
        );
    }

    #[test]
    fn test_sentence_chunker_single() {
        let c = SentenceChunker::new(3);
        let chunks = c.chunk("One sentence.");
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_sentence_chunker_exclamation_question() {
        let c = SentenceChunker::new(2);
        let chunks = c.chunk("Hello! How are you? I'm fine.");
        assert_eq!(chunks.len(), 2);
    }

    // ---- RecursiveChunker ----

    #[test]
    fn test_recursive_chunker_empty() {
        let c = RecursiveChunker::new(200);
        assert!(c.chunk("").is_empty());
    }

    #[test]
    fn test_recursive_chunker_paragraph_split() {
        let text = "Short paragraph one.\n\nShort paragraph two.";
        let c = RecursiveChunker::new(200);
        let chunks = c.chunk(text);
        assert_eq!(chunks.len(), 2);
    }

    #[test]
    fn test_recursive_chunker_long_paragraph_splits_to_sentences() {
        // Each sentence is 100 characters long, so no two of them fit
        // together into a 110-character chunk.
        let long_sentence = "word ".repeat(20);
        let text = format!(
            "{}. {}. {}.",
            long_sentence.trim(),
            long_sentence.trim(),
            long_sentence.trim()
        );
        let c = RecursiveChunker::new(110);
        let chunks = c.chunk(&text);
        assert!(
            chunks.len() >= 2,
            "expected multiple chunks for long paragraph"
        );
    }

    #[test]
    fn test_recursive_chunker_each_chunk_within_limit() {
        let long_text = format!("word. {}", "sentence text here. ".repeat(50));
        let c = RecursiveChunker::new(100);
        for chunk in c.chunk(&long_text) {
            assert!(
                chunk.chars().count() <= 100,
                "chunk exceeds max_chunk_size: {} chars",
                chunk.chars().count()
            );
        }
    }
}