1#[derive(Debug, Clone, PartialEq, Eq)]
36pub struct ChunkConfig {
37 pub target_tokens: u32,
39 pub overlap_tokens: u32,
41}
42
43impl Default for ChunkConfig {
44 fn default() -> Self {
45 Self {
46 target_tokens: 500,
47 overlap_tokens: 50,
48 }
49 }
50}
51
52#[derive(Debug, Clone, PartialEq, Eq)]
60pub struct ChunkSpec {
61 pub content: String,
63 pub start_offset: u32,
65 pub end_offset: u32,
67 pub token_count: u32,
69}
70
71pub(crate) fn approx_token_count(text: &str) -> u32 {
73 let chars = text.chars().count();
74 u32::try_from(chars / 4).unwrap_or(u32::MAX)
76}
77
78pub fn chunk_text(text: &str, config: &ChunkConfig) -> Vec<ChunkSpec> {
92 if text.is_empty() {
93 return Vec::new();
94 }
95 let target = config.target_tokens.max(1);
96 let overlap = config.overlap_tokens.min(target.saturating_sub(1));
97 let total_tokens = approx_token_count(text);
99 if total_tokens <= target.saturating_mul(3) / 2 {
100 return vec![ChunkSpec {
101 content: text.to_string(),
102 start_offset: 0,
103 end_offset: u32::try_from(text.len()).unwrap_or(u32::MAX),
104 token_count: total_tokens,
105 }];
106 }
107
108 let paragraphs = split_paragraphs(text);
109 let oversize_threshold = target.saturating_mul(3) / 2;
110
111 let mut chunks: Vec<ChunkSpec> = Vec::new();
112 let mut cursor_start: usize = 0; let mut cursor_end: usize = 0; let mut cursor_tokens: u32 = 0;
115
116 for p in ¶graphs {
117 let p_tokens = approx_token_count(&text[p.start..p.end]);
118
119 if p_tokens >= oversize_threshold {
122 if cursor_end > cursor_start {
123 push_chunk(&mut chunks, text, cursor_start, cursor_end);
124 }
125 slide_window(&mut chunks, text, p.start, p.end, target, overlap);
126 cursor_start = window_overlap_start(text, p.end, overlap);
127 cursor_end = cursor_start;
128 cursor_tokens = 0;
129 continue;
130 }
131
132 if cursor_end > cursor_start && cursor_tokens + p_tokens > oversize_threshold {
136 push_chunk(&mut chunks, text, cursor_start, cursor_end);
137 cursor_start = window_overlap_start(text, cursor_end, overlap);
138 }
141
142 cursor_end = p.end;
144 cursor_tokens = approx_token_count(&text[cursor_start..cursor_end]);
145
146 if cursor_tokens >= target {
149 push_chunk(&mut chunks, text, cursor_start, cursor_end);
150 cursor_start = window_overlap_start(text, cursor_end, overlap);
151 cursor_end = cursor_start;
152 }
154 }
155
156 if cursor_end > cursor_start {
158 push_chunk(&mut chunks, text, cursor_start, cursor_end);
159 }
160
161 chunks
162}
163
164#[derive(Debug, Clone, Copy)]
166struct Paragraph {
167 start: usize,
168 end: usize,
172}
173
174fn split_paragraphs(text: &str) -> Vec<Paragraph> {
178 let bytes = text.as_bytes();
179 let n = bytes.len();
180 let mut out = Vec::new();
181 let mut start = 0usize;
182 let mut i = 0usize;
183 while i < n {
184 if i + 1 < n && bytes[i] == b'\n' && bytes[i + 1] == b'\n' {
186 let mut j = i + 2;
190 while j < n && bytes[j] == b'\n' {
191 j += 1;
192 }
193 out.push(Paragraph { start, end: j });
194 start = j;
195 i = j;
196 continue;
197 }
198 i += 1;
199 }
200 if start < n {
201 out.push(Paragraph { start, end: n });
202 }
203 out
204}
205
206fn window_overlap_start(text: &str, end: usize, overlap: u32) -> usize {
210 if overlap == 0 || end == 0 {
211 return end;
212 }
213 let target_chars = (overlap as usize) * 4;
214 let mut count = 0usize;
215 let prefix = &text[..end];
217 for (idx, _ch) in prefix.char_indices().rev() {
218 count += 1;
219 if count > target_chars {
220 return idx;
221 }
222 }
223 0
224}
225
226fn push_chunk(out: &mut Vec<ChunkSpec>, text: &str, start: usize, end: usize) {
227 debug_assert!(start < end, "push_chunk: empty range [{start},{end})");
228 debug_assert!(text.is_char_boundary(start), "start {start} not on char boundary");
229 debug_assert!(text.is_char_boundary(end), "end {end} not on char boundary");
230 let slice = &text[start..end];
231 out.push(ChunkSpec {
232 content: slice.to_string(),
233 start_offset: u32::try_from(start).unwrap_or(u32::MAX),
234 end_offset: u32::try_from(end).unwrap_or(u32::MAX),
235 token_count: approx_token_count(slice),
236 });
237}
238
239fn slide_window(
244 out: &mut Vec<ChunkSpec>,
245 text: &str,
246 range_start: usize,
247 range_end: usize,
248 target: u32,
249 overlap: u32,
250) {
251 let target_chars = (target as usize) * 4;
252 let mut window_start = range_start;
253 while window_start < range_end {
254 let mut chars_seen = 0usize;
257 let mut window_end = range_end;
258 let suffix = &text[window_start..range_end];
259 for (idx, _ch) in suffix.char_indices() {
260 chars_seen += 1;
261 if chars_seen >= target_chars {
262 window_end = window_start + idx;
263 break;
264 }
265 }
266 if window_end < range_end {
270 let lookback = target_chars / 10;
271 let snap = find_sentence_break(text, window_start, window_end, lookback);
272 window_end = snap;
273 }
274 if window_end <= window_start {
275 let next = text[window_start..range_end]
277 .char_indices()
278 .next()
279 .map(|(_, c)| window_start + c.len_utf8())
280 .unwrap_or(range_end);
281 window_end = next;
282 }
283 push_chunk(out, text, window_start, window_end);
284 if window_end >= range_end {
285 break;
286 }
287 let next_start = window_overlap_start(text, window_end, overlap);
288 window_start = next_start.max(window_start + 1);
290 while window_start < range_end && !text.is_char_boundary(window_start) {
292 window_start += 1;
293 }
294 }
295}
296
297fn find_sentence_break(
300 text: &str,
301 window_start: usize,
302 window_end: usize,
303 lookback_chars: usize,
304) -> usize {
305 let bytes = text.as_bytes();
306 let look_start = {
308 let prefix = &text[window_start..window_end];
309 let mut count = 0usize;
310 let mut start_idx = 0usize;
311 for (idx, _ch) in prefix.char_indices().rev() {
312 count += 1;
313 if count >= lookback_chars {
314 start_idx = window_start + idx;
315 break;
316 }
317 }
318 if count < lookback_chars {
319 window_start
320 } else {
321 start_idx
322 }
323 };
324 let mut i = window_end;
326 while i > look_start {
327 let prev = match text[..i].char_indices().next_back() {
328 Some((idx, _)) => idx,
329 None => break,
330 };
331 let ch = bytes[prev];
332 if ch == b'.' || ch == b'!' || ch == b'?' || ch == b'\n' {
333 return i; }
335 i = prev;
336 }
337 window_end
338}
339
340#[cfg(test)]
345mod tests {
346 use super::*;
347
348 #[test]
349 fn chunk_empty_text_returns_empty_vec() {
350 let out = chunk_text("", &ChunkConfig::default());
351 assert!(out.is_empty());
352 }
353
354 #[test]
355 fn chunk_short_text_returns_single_chunk() {
356 let text = "Hello world. This is a tiny doc.";
358 let out = chunk_text(text, &ChunkConfig::default());
359 assert_eq!(out.len(), 1);
360 assert_eq!(out[0].content, text);
361 assert_eq!(out[0].start_offset, 0);
362 assert_eq!(out[0].end_offset as usize, text.len());
363 }
364
365 fn synthetic_text(paragraph_count: usize, words_per_paragraph: usize) -> String {
370 let mut s = String::new();
371 for p in 0..paragraph_count {
372 for w in 0..words_per_paragraph {
373 if w > 0 {
374 s.push(' ');
375 }
376 s.push_str(&format!("word{w:02}"));
377 }
378 s.push('.');
379 if p + 1 < paragraph_count {
380 s.push_str("\n\n");
381 }
382 }
383 s
384 }
385
386 #[test]
387 fn chunk_long_text_splits_into_multiple() {
388 let text = synthetic_text(500, 8); let cfg = ChunkConfig::default();
392 let out = chunk_text(&text, &cfg);
393 assert!(out.len() > 1, "expected multiple chunks, got {}", out.len());
394 for c in &out {
396 let slice = &text[c.start_offset as usize..c.end_offset as usize];
397 assert_eq!(slice, c.content.as_str());
398 }
399 }
400
401 #[test]
402 fn chunk_respects_paragraph_boundaries() {
403 let text = synthetic_text(40, 6);
406 let cfg = ChunkConfig {
407 target_tokens: 50,
408 overlap_tokens: 5,
409 };
410 let out = chunk_text(&text, &cfg);
411 assert!(out.len() > 1);
412 for c in &out {
415 let last_char = c.content.chars().last().unwrap();
416 assert!(
421 last_char == '.' || last_char == '\n' || last_char.is_ascii_alphanumeric(),
422 "chunk ends mid-token at: {:?}",
423 &c.content[c.content.len().saturating_sub(20)..]
424 );
425 }
426 }
427
428 #[test]
429 fn chunk_target_size_band() {
430 let text = synthetic_text(300, 8);
432 let cfg = ChunkConfig {
433 target_tokens: 100,
434 overlap_tokens: 10,
435 };
436 let out = chunk_text(&text, &cfg);
437 assert!(out.len() >= 3, "need enough chunks to evaluate band");
438 let lower = cfg.target_tokens / 2;
440 let upper = cfg.target_tokens * 3 / 2;
441 for (i, c) in out.iter().enumerate().take(out.len() - 1) {
442 assert!(
443 c.token_count >= lower && c.token_count <= upper,
444 "chunk {i} out of band: token_count={} band=[{lower},{upper}]",
445 c.token_count,
446 );
447 }
448 }
449
450 #[test]
451 fn chunk_offsets_monotonic_with_overlap() {
452 let text = synthetic_text(200, 8);
453 let cfg = ChunkConfig {
454 target_tokens: 100,
455 overlap_tokens: 10,
456 };
457 let out = chunk_text(&text, &cfg);
458 assert!(out.len() >= 2);
459 for window in out.windows(2) {
460 let a = &window[0];
461 let b = &window[1];
462 assert!(
464 b.end_offset > a.end_offset,
465 "end_offset must increase across chunks: {} -> {}",
466 a.end_offset,
467 b.end_offset
468 );
469 assert!(
474 b.start_offset <= a.end_offset,
475 "next chunk should overlap or abut the previous: a.end={} b.start={}",
476 a.end_offset,
477 b.start_offset,
478 );
479 }
480 }
481
482 #[test]
483 fn chunk_utf8_safe_offsets() {
484 let para = "こんにちは世界。これは日本語のテストです。Caféの紅茶。".repeat(40);
487 let text = format!(
488 "{para}\n\n{}",
489 "Bonjour le monde. Voici un test en français.".repeat(40)
490 );
491 let cfg = ChunkConfig {
492 target_tokens: 80,
493 overlap_tokens: 10,
494 };
495 let out = chunk_text(&text, &cfg);
496 assert!(!out.is_empty());
497 for c in &out {
498 let s = c.start_offset as usize;
499 let e = c.end_offset as usize;
500 assert!(text.is_char_boundary(s), "start {s} not on char boundary");
501 assert!(text.is_char_boundary(e), "end {e} not on char boundary");
502 assert_eq!(&text[s..e], c.content);
504 }
505 }
506
507 #[test]
508 fn chunk_very_large_text() {
509 let text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(1_800);
511 assert!(text.len() > 100_000);
512 let cfg = ChunkConfig::default();
513 let start = std::time::Instant::now();
514 let out = chunk_text(&text, &cfg);
515 let elapsed = start.elapsed();
516 assert!(out.len() > 10, "expected many chunks, got {}", out.len());
517 assert!(
518 elapsed < std::time::Duration::from_secs(2),
519 "100KB chunking too slow: {elapsed:?}"
520 );
521 }
522
523 #[test]
524 fn chunk_token_count_approximation() {
525 assert_eq!(approx_token_count("0123456789".repeat(4).as_str()), 10);
527 assert_eq!(approx_token_count("あいうえおかきくけこさし"), 3);
529 assert_eq!(approx_token_count(""), 0);
531 }
532
533 #[test]
534 fn chunk_oversized_paragraph_slides_window() {
535 let sentence = "This is a sentence with several words in it. ".to_string();
538 let mega_paragraph = sentence.repeat(200); let cfg = ChunkConfig {
540 target_tokens: 100,
541 overlap_tokens: 10,
542 };
543 let out = chunk_text(&mega_paragraph, &cfg);
544 assert!(
545 out.len() >= 3,
546 "expected oversized paragraph to be split, got {} chunks",
547 out.len()
548 );
549 for c in &out {
551 let s = c.start_offset as usize;
552 let e = c.end_offset as usize;
553 assert!(mega_paragraph.is_char_boundary(s));
554 assert!(mega_paragraph.is_char_boundary(e));
555 assert_eq!(&mega_paragraph[s..e], c.content);
556 }
557 }
558
559 #[test]
560 fn chunk_config_default_is_500_50() {
561 let c = ChunkConfig::default();
562 assert_eq!(c.target_tokens, 500);
563 assert_eq!(c.overlap_tokens, 50);
564 }
565
566 #[test]
567 fn chunk_text_offsets_cover_input_modulo_overlap() {
568 let text = synthetic_text(80, 8);
570 let cfg = ChunkConfig::default();
571 let out = chunk_text(&text, &cfg);
572 assert!(!out.is_empty());
573 assert_eq!(out[0].start_offset, 0);
574 assert_eq!(out.last().unwrap().end_offset as usize, text.len());
575 }
576}