/// Sizing parameters for [`chunk_text`].
///
/// Both values are expressed in approximate tokens as estimated by
/// `approx_token_count` (one token per four characters).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Desired size of each chunk. `chunk_text` raises a value of 0 to 1.
    pub target_tokens: u32,
    /// Amount of trailing text repeated at the start of the next chunk.
    /// `chunk_text` clamps this to at most `target_tokens - 1`.
    pub overlap_tokens: u32,
}

impl Default for ChunkConfig {
    /// Returns a configuration of 500 target tokens with 50 tokens of overlap.
    fn default() -> Self {
        let (target_tokens, overlap_tokens) = (500, 50);
        Self {
            target_tokens,
            overlap_tokens,
        }
    }
}
/// One chunk produced by [`chunk_text`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkSpec {
    /// Owned copy of the chunk's text, i.e. the source slice
    /// `start_offset..end_offset`.
    pub content: String,
    /// Byte offset in the source text where the chunk begins
    /// (stored as `u32::MAX` if the offset does not fit in a `u32`).
    pub start_offset: u32,
    /// Exclusive byte offset in the source text where the chunk ends
    /// (stored as `u32::MAX` if the offset does not fit in a `u32`).
    pub end_offset: u32,
    /// Approximate token count of `content` (characters divided by four).
    pub token_count: u32,
}
/// Estimates how many tokens `text` contains.
///
/// The estimate is the number of Unicode scalar values divided by four,
/// rounded down, so inputs shorter than four characters count as zero
/// tokens. The result saturates at `u32::MAX`.
pub(crate) fn approx_token_count(text: &str) -> u32 {
    let quarter_chars = text.chars().count() / 4;
    match u32::try_from(quarter_chars) {
        Ok(tokens) => tokens,
        Err(_) => u32::MAX,
    }
}
/// Splits `text` into chunks of roughly `config.target_tokens` approximate
/// tokens, preferring paragraph boundaries and repeating about
/// `config.overlap_tokens` tokens of trailing context between chunks.
///
/// Behaviour:
/// - Empty input yields an empty vector.
/// - Input whose estimated size is at most 1.5x the target is returned as a
///   single chunk covering the whole text.
/// - Otherwise paragraphs (separated by blank lines) are accumulated until
///   the running chunk reaches the target. A paragraph that is itself at
///   least 1.5x the target is split with a sliding window instead.
///
/// `target_tokens` is raised to at least 1 and `overlap_tokens` is clamped
/// to be strictly smaller than the effective target.
///
/// Offsets in the returned [`ChunkSpec`]s are byte offsets into `text`.
pub fn chunk_text(text: &str, config: &ChunkConfig) -> Vec<ChunkSpec> {
    if text.is_empty() {
        return Vec::new();
    }

    let target = config.target_tokens.max(1);
    let overlap = config.overlap_tokens.min(target.saturating_sub(1));
    // 1.5x the target: the size at which text stops fitting in one chunk.
    let oversize_threshold = target.saturating_mul(3) / 2;

    let total_tokens = approx_token_count(text);
    if total_tokens <= oversize_threshold {
        return vec![ChunkSpec {
            content: text.to_string(),
            start_offset: 0,
            end_offset: u32::try_from(text.len()).unwrap_or(u32::MAX),
            token_count: total_tokens,
        }];
    }

    let paragraphs = split_paragraphs(text);
    let mut chunks: Vec<ChunkSpec> = Vec::new();
    // The chunk being accumulated is `text[cursor_start..cursor_end]`;
    // an empty range means nothing is pending.
    let mut cursor_start: usize = 0;
    let mut cursor_end: usize = 0;
    let mut cursor_tokens: u32 = 0;

    for p in &paragraphs {
        let p_tokens = approx_token_count(&text[p.start..p.end]);

        if p_tokens >= oversize_threshold {
            // The paragraph cannot fit in one chunk: flush whatever is
            // pending, then cut the paragraph with a sliding window.
            if cursor_end > cursor_start {
                push_chunk(&mut chunks, text, cursor_start, cursor_end);
            }
            slide_window(&mut chunks, text, p.start, p.end, target, overlap);
            // Seed the next chunk with the overlap, but leave it empty until
            // another paragraph extends it.
            cursor_start = window_overlap_start(text, p.end, overlap);
            cursor_end = cursor_start;
            cursor_tokens = 0;
            continue;
        }

        // Appending this paragraph would overshoot: close the pending chunk
        // first and start the next one from the overlap region.
        if cursor_end > cursor_start
            && cursor_tokens.saturating_add(p_tokens) > oversize_threshold
        {
            push_chunk(&mut chunks, text, cursor_start, cursor_end);
            cursor_start = window_overlap_start(text, cursor_end, overlap);
        }

        cursor_end = p.end;
        cursor_tokens = approx_token_count(&text[cursor_start..cursor_end]);

        if cursor_tokens >= target {
            push_chunk(&mut chunks, text, cursor_start, cursor_end);
            cursor_start = window_overlap_start(text, cursor_end, overlap);
            cursor_end = cursor_start;
        }
    }

    // Flush the trailing partial chunk, if any.
    if cursor_end > cursor_start {
        push_chunk(&mut chunks, text, cursor_start, cursor_end);
    }
    chunks
}
/// Half-open byte range `[start, end)` of one paragraph in the source text.
#[derive(Debug, Clone, Copy)]
struct Paragraph {
    start: usize,
    end: usize,
}

/// Partitions `text` into consecutive paragraphs.
///
/// A paragraph ends after a run of two or more `\n` bytes; the whole run is
/// kept as part of the paragraph that precedes it, so the returned ranges
/// are contiguous and together cover every byte of `text`. A single `\n`
/// does not end a paragraph. Empty input yields no paragraphs.
fn split_paragraphs(text: &str) -> Vec<Paragraph> {
    let bytes = text.as_bytes();
    let total = bytes.len();
    let mut paragraphs = Vec::new();
    let mut start = 0usize;

    // `start` is always 0 or just past a '\n', hence a valid char boundary.
    while let Some(rel) = text[start..].find("\n\n") {
        let separator = start + rel;
        // Absorb the entire newline run, not just the first two bytes.
        let run_len = bytes[separator..]
            .iter()
            .take_while(|&&b| b == b'\n')
            .count();
        let end = separator + run_len;
        paragraphs.push(Paragraph { start, end });
        start = end;
    }

    if start < total {
        paragraphs.push(Paragraph { start, end: total });
    }
    paragraphs
}
/// Returns the byte offset at which a chunk overlapping the text that ends
/// at `end` should begin.
///
/// The overlap spans `overlap * 4` characters (four characters per
/// approximate token), counted backwards from `end`. The result is always
/// on a character boundary. If fewer characters than that precede `end`,
/// the result is 0. When `overlap` or `end` is 0 there is no overlap and
/// `end` is returned unchanged.
///
/// # Panics
///
/// Panics if `end` is past the end of `text` or not on a character boundary.
fn window_overlap_start(text: &str, end: usize, overlap: u32) -> usize {
    if overlap == 0 || end == 0 {
        return end;
    }
    // `overlap >= 1` here, so this is at least 4 and the `- 1` cannot wrap.
    let overlap_chars = (overlap as usize).saturating_mul(4);
    // The `overlap_chars`-th character from the end starts the overlap, so
    // the overlap holds exactly `overlap_chars` characters.
    text[..end]
        .char_indices()
        .rev()
        .nth(overlap_chars - 1)
        .map_or(0, |(idx, _)| idx)
}
/// Appends the chunk `text[start..end]` to `out`.
///
/// The stored offsets are the byte offsets `start` and `end`, saturated to
/// `u32::MAX` when they do not fit in a `u32`; the token count is the
/// approximate count of the copied slice.
///
/// Debug builds assert that the range is non-empty and that both ends lie on
/// character boundaries.
fn push_chunk(out: &mut Vec<ChunkSpec>, text: &str, start: usize, end: usize) {
    debug_assert!(start < end, "push_chunk: empty range [{start},{end})");
    debug_assert!(text.is_char_boundary(start), "start {start} not on char boundary");
    debug_assert!(text.is_char_boundary(end), "end {end} not on char boundary");

    let to_offset = |pos: usize| u32::try_from(pos).unwrap_or(u32::MAX);
    let content = text[start..end].to_string();
    let token_count = approx_token_count(&content);
    out.push(ChunkSpec {
        content,
        start_offset: to_offset(start),
        end_offset: to_offset(end),
        token_count,
    });
}
/// Splits the byte range `range_start..range_end` of `text` into overlapping
/// windows and appends each one to `out`.
///
/// Each window spans up to `target * 4` characters (four characters per
/// approximate token). A window that stops short of `range_end` is pulled
/// back to the nearest sentence break found within the last tenth of the
/// window, if there is one. The next window starts at the overlap position
/// computed from the end of the previous one, and always at least one byte
/// further than the previous start so the loop terminates.
///
/// `range_start` is expected to lie on a character boundary; slicing panics
/// otherwise.
fn slide_window(
    out: &mut Vec<ChunkSpec>,
    text: &str,
    range_start: usize,
    range_end: usize,
    target: u32,
    overlap: u32,
) {
    let target_chars = (target as usize).saturating_mul(4);
    let lookback = target_chars / 10;
    let mut window_start = range_start;

    while window_start < range_end {
        let remaining = &text[window_start..range_end];

        // End the window just past its `target_chars`-th character so that
        // it holds exactly `target_chars` characters; if no more than that
        // remain, the window runs to the end of the range.
        let mut window_end = remaining
            .char_indices()
            .nth(target_chars.saturating_sub(1))
            .map_or(range_end, |(idx, ch)| window_start + idx + ch.len_utf8());

        // Only interior cuts are snapped to a sentence break; the final
        // window always runs to the end of the range.
        if window_end < range_end {
            window_end = find_sentence_break(text, window_start, window_end, lookback);
        }

        // Defensive: never emit an empty window; take at least one character.
        if window_end <= window_start {
            window_end = remaining
                .chars()
                .next()
                .map_or(range_end, |ch| window_start + ch.len_utf8());
        }

        push_chunk(out, text, window_start, window_end);
        if window_end >= range_end {
            break;
        }

        // Step back by the overlap, but always make forward progress.
        let next_start = window_overlap_start(text, window_end, overlap);
        window_start = next_start.max(window_start + 1);
        // `window_start + 1` may land inside a multi-byte character.
        while window_start < range_end && !text.is_char_boundary(window_start) {
            window_start += 1;
        }
    }
}
/// Looks for a sentence break near the end of `text[window_start..window_end]`.
///
/// Only the last `lookback_chars` characters of the window are examined (at
/// least one character when the window is non-empty, and never anything
/// before `window_start`). Scanning backwards, the first `.`, `!`, `?` or
/// `\n` found determines the result: the byte offset just past that
/// character. If none is found, `window_end` is returned unchanged.
///
/// Only these ASCII terminators are recognised.
///
/// # Panics
///
/// Panics if `window_start..window_end` is not a valid character-aligned
/// range of `text`.
fn find_sentence_break(
    text: &str,
    window_start: usize,
    window_end: usize,
    lookback_chars: usize,
) -> usize {
    let window = &text[window_start..window_end];

    // Start of the region to search: the `lookback_chars`-th character from
    // the end of the window, or the window start if the window is shorter
    // (including the empty window).
    let look_start = window
        .char_indices()
        .rev()
        .nth(lookback_chars.saturating_sub(1))
        .map_or(window_start, |(idx, _)| window_start + idx);

    // Cut immediately after the last terminator inside the search region.
    text[look_start..window_end]
        .char_indices()
        .rev()
        .find(|&(_, ch)| matches!(ch, '.' | '!' | '?' | '\n'))
        .map_or(window_end, |(idx, ch)| look_start + idx + ch.len_utf8())
}
// Unit tests for the chunker: edge cases (empty/short input), size and
// offset invariants of the produced chunks, UTF-8 safety, and the
// sliding-window path for oversized paragraphs.
#[cfg(test)]
mod tests {
use super::*;
// Empty input produces no chunks at all.
#[test]
fn chunk_empty_text_returns_empty_vec() {
let out = chunk_text("", &ChunkConfig::default());
assert!(out.is_empty());
}
// Input far below the default target comes back as one chunk spanning the
// whole text.
#[test]
fn chunk_short_text_returns_single_chunk() {
let text = "Hello world. This is a tiny doc.";
let out = chunk_text(text, &ChunkConfig::default());
assert_eq!(out.len(), 1);
assert_eq!(out[0].content, text);
assert_eq!(out[0].start_offset, 0);
assert_eq!(out[0].end_offset as usize, text.len());
}
// Builds ASCII test input: `paragraph_count` paragraphs, each made of
// `words_per_paragraph` words of the form `wordNN` separated by single
// spaces and terminated by '.'. Paragraphs are joined by a blank line
// ("\n\n"); there is no separator after the last one.
fn synthetic_text(paragraph_count: usize, words_per_paragraph: usize) -> String {
let mut s = String::new();
for p in 0..paragraph_count {
for w in 0..words_per_paragraph {
if w > 0 {
s.push(' ');
}
s.push_str(&format!("word{w:02}"));
}
s.push('.');
if p + 1 < paragraph_count {
s.push_str("\n\n");
}
}
s
}
// Long multi-paragraph input yields several chunks, and every chunk's
// content equals the source slice named by its offsets.
#[test]
fn chunk_long_text_splits_into_multiple() {
let text = synthetic_text(500, 8); let cfg = ChunkConfig::default();
let out = chunk_text(&text, &cfg);
assert!(out.len() > 1, "expected multiple chunks, got {}", out.len());
for c in &out {
let slice = &text[c.start_offset as usize..c.end_offset as usize];
assert_eq!(slice, c.content.as_str());
}
}
// Checks the last character of every chunk. The accepted set is '.', '\n',
// or any ASCII alphanumeric, so for this input only a chunk ending in a
// space would fail.
#[test]
fn chunk_respects_paragraph_boundaries() {
let text = synthetic_text(40, 6);
let cfg = ChunkConfig {
target_tokens: 50,
overlap_tokens: 5,
};
let out = chunk_text(&text, &cfg);
assert!(out.len() > 1);
for c in &out {
let last_char = c.content.chars().last().unwrap();
assert!(
last_char == '.' || last_char == '\n' || last_char.is_ascii_alphanumeric(),
"chunk ends mid-token at: {:?}",
// Byte slicing is safe here because the synthetic text is pure ASCII.
&c.content[c.content.len().saturating_sub(20)..]
);
}
}
// Every chunk except the last must have a token count within
// [target / 2, target * 3 / 2]; the last chunk may be a short remainder.
#[test]
fn chunk_target_size_band() {
let text = synthetic_text(300, 8);
let cfg = ChunkConfig {
target_tokens: 100,
overlap_tokens: 10,
};
let out = chunk_text(&text, &cfg);
assert!(out.len() >= 3, "need enough chunks to evaluate band");
let lower = cfg.target_tokens / 2;
let upper = cfg.target_tokens * 3 / 2;
for (i, c) in out.iter().enumerate().take(out.len() - 1) {
assert!(
c.token_count >= lower && c.token_count <= upper,
"chunk {i} out of band: token_count={} band=[{lower},{upper}]",
c.token_count,
);
}
}
// Consecutive chunks must advance (strictly increasing end offsets) and
// must not leave a gap: each chunk starts at or before the previous end.
#[test]
fn chunk_offsets_monotonic_with_overlap() {
let text = synthetic_text(200, 8);
let cfg = ChunkConfig {
target_tokens: 100,
overlap_tokens: 10,
};
let out = chunk_text(&text, &cfg);
assert!(out.len() >= 2);
for window in out.windows(2) {
let a = &window[0];
let b = &window[1];
assert!(
b.end_offset > a.end_offset,
"end_offset must increase across chunks: {} -> {}",
a.end_offset,
b.end_offset
);
assert!(
b.start_offset <= a.end_offset,
"next chunk should overlap or abut the previous: a.end={} b.start={}",
a.end_offset,
b.start_offset,
);
}
}
// Mixed Japanese/French input with multi-byte characters: all offsets must
// fall on char boundaries and reproduce the chunk content when sliced.
#[test]
fn chunk_utf8_safe_offsets() {
let para = "こんにちは世界。これは日本語のテストです。Caféの紅茶。".repeat(40);
let text = format!(
"{para}\n\n{}",
"Bonjour le monde. Voici un test en français.".repeat(40)
);
let cfg = ChunkConfig {
target_tokens: 80,
overlap_tokens: 10,
};
let out = chunk_text(&text, &cfg);
assert!(!out.is_empty());
for c in &out {
let s = c.start_offset as usize;
let e = c.end_offset as usize;
assert!(text.is_char_boundary(s), "start {s} not on char boundary");
assert!(text.is_char_boundary(e), "end {e} not on char boundary");
assert_eq!(&text[s..e], c.content);
}
}
// Smoke/performance test on more than 100 KB of text containing no blank
// lines, i.e. a single paragraph.
// NOTE(review): the 2-second limit is a wall-clock bound and may be
// sensitive to machine load or unoptimized builds.
#[test]
fn chunk_very_large_text() {
let text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(1_800);
assert!(text.len() > 100_000);
let cfg = ChunkConfig::default();
let start = std::time::Instant::now();
let out = chunk_text(&text, &cfg);
let elapsed = start.elapsed();
assert!(out.len() > 10, "expected many chunks, got {}", out.len());
assert!(
elapsed < std::time::Duration::from_secs(2),
"100KB chunking too slow: {elapsed:?}"
);
}
// The token estimate is characters / 4 and counts characters, not bytes:
// 40 ASCII chars -> 10 tokens, 12 multi-byte kana -> 3 tokens.
#[test]
fn chunk_token_count_approximation() {
assert_eq!(approx_token_count("0123456789".repeat(4).as_str()), 10);
assert_eq!(approx_token_count("あいうえおかきくけこさし"), 3);
assert_eq!(approx_token_count(""), 0);
}
// A single paragraph (no blank lines) far larger than the target must
// still be split, with valid, content-consistent offsets.
#[test]
fn chunk_oversized_paragraph_slides_window() {
let sentence = "This is a sentence with several words in it. ".to_string();
let mega_paragraph = sentence.repeat(200); let cfg = ChunkConfig {
target_tokens: 100,
overlap_tokens: 10,
};
let out = chunk_text(&mega_paragraph, &cfg);
assert!(
out.len() >= 3,
"expected oversized paragraph to be split, got {} chunks",
out.len()
);
for c in &out {
let s = c.start_offset as usize;
let e = c.end_offset as usize;
assert!(mega_paragraph.is_char_boundary(s));
assert!(mega_paragraph.is_char_boundary(e));
assert_eq!(&mega_paragraph[s..e], c.content);
}
}
// Pins the default configuration values.
#[test]
fn chunk_config_default_is_500_50() {
let c = ChunkConfig::default();
assert_eq!(c.target_tokens, 500);
assert_eq!(c.overlap_tokens, 50);
}
// The first chunk starts at offset 0 and the last chunk ends at the end of
// the input.
#[test]
fn chunk_text_offsets_cover_input_modulo_overlap() {
let text = synthetic_text(80, 8);
let cfg = ChunkConfig::default();
let out = chunk_text(&text, &cfg);
assert!(!out.is_empty());
assert_eq!(out[0].start_offset, 0);
assert_eq!(out.last().unwrap().end_offset as usize, text.len());
}
}