use crate::tokens::estimate_tokens;
/// Token budget settings consumed by `chunk_text`.
///
/// The fields are public, so a value can be built directly; `ChunkOptions::new`
/// additionally normalises both numbers.
#[derive(Debug, Clone, Copy)]
pub struct ChunkOptions {
/// Budget, in estimated tokens, that `chunk_text` packs each chunk up to.
pub max_tokens: usize,
/// Token allowance for trailing units repeated at the start of the next
/// chunk; `0` disables overlap.
pub overlap_tokens: usize,
}
impl ChunkOptions {
    /// Builds a normalised set of options.
    ///
    /// `max_tokens` is raised to at least 1, and `overlap_tokens` is capped at
    /// `max_tokens - 1`, so the stored overlap is always strictly smaller than
    /// the stored budget.
    pub fn new(max_tokens: usize, overlap_tokens: usize) -> Self {
        let budget = usize::max(max_tokens, 1);
        // `budget >= 1`, so this subtraction cannot underflow.
        let overlap_cap = budget - 1;
        Self {
            max_tokens: budget,
            overlap_tokens: usize::min(overlap_tokens, overlap_cap),
        }
    }
}
impl Default for ChunkOptions {
fn default() -> Self {
Self::new(512, 64)
}
}
/// Reports whether `ch` is one of the sentence-ending marks recognised by the
/// segmenter: ASCII `.`, `!`, `?`, the ideographic full stop (U+3002), and the
/// fullwidth exclamation (U+FF01) and question (U+FF1F) marks.
fn is_terminator(ch: char) -> bool {
    const TERMINATORS: [char; 6] = ['.', '!', '?', '\u{3002}', '\u{FF01}', '\u{FF1F}'];
    TERMINATORS.contains(&ch)
}
fn segment(text: &str) -> Vec<String> {
let mut units = Vec::new();
let mut current = String::new();
for ch in text.chars() {
if ch == '\n' || ch == '\r' {
let trimmed = current.trim();
if !trimmed.is_empty() {
units.push(trimmed.to_string());
}
current.clear();
continue;
}
current.push(ch);
if is_terminator(ch) {
let trimmed = current.trim();
if !trimmed.is_empty() {
units.push(trimmed.to_string());
}
current.clear();
}
}
let trimmed = current.trim();
if !trimmed.is_empty() {
units.push(trimmed.to_string());
}
units
}
/// Packs `text` into chunks made of whole segmentation units joined by single spaces.
///
/// Units come from `segment`. Consecutive units are accumulated while their
/// joined text is estimated (via `estimate_tokens`) at no more than
/// `opts.max_tokens`. When the next unit would overflow, the accumulated units
/// are emitted as a chunk. If `opts.overlap_tokens` is non-zero, the next chunk
/// starts with a suffix of the chunk just emitted, chosen by `overlap_start`.
///
/// The overlap is shortened, down to nothing if necessary, until it leaves room
/// for the next new unit. Every emitted chunk therefore contains at least one
/// unit that no earlier chunk contained; a chunk made only of repeated overlap
/// is never produced.
///
/// A single unit that alone exceeds the budget is emitted unsplit as its own
/// chunk. Returns an empty vector when `text` has no units.
pub fn chunk_text(text: &str, opts: &ChunkOptions) -> Vec<String> {
    let units = segment(text);
    if units.is_empty() {
        return Vec::new();
    }

    let mut chunks: Vec<String> = Vec::new();
    // `units[start..end]` is the chunk being built; `units[end]` is the unit on trial.
    let mut start: usize = 0;
    let mut end: usize = 0;
    while end < units.len() {
        let running = units[start..=end].join(" ");
        if estimate_tokens(&running) <= opts.max_tokens {
            end += 1;
            continue;
        }
        if start == end {
            // A lone unit over budget cannot be split here; emit it as-is.
            chunks.push(running);
            end += 1;
            start = end;
            continue;
        }
        chunks.push(units[start..end].join(" "));

        let mut next_start = if opts.overlap_tokens == 0 || end - start <= 1 {
            end
        } else {
            let candidate = overlap_start(&units, start, end, opts.overlap_tokens);
            // An overlap covering the whole chunk would make no progress;
            // fall back to repeating only the last unit.
            if candidate <= start {
                end - 1
            } else {
                candidate
            }
        };
        // Shrink the overlap until `units[end]` fits next to it. Without this,
        // the following iteration would overflow immediately and emit the
        // overlap by itself, duplicating the tail of the chunk just pushed.
        while next_start < end
            && estimate_tokens(&units[next_start..=end].join(" ")) > opts.max_tokens
        {
            next_start += 1;
        }
        start = next_start;
    }

    // Units are trimmed and non-empty, so the joined tail needs no further cleanup.
    if start < units.len() {
        chunks.push(units[start..].join(" "));
    }
    chunks
}
/// Picks the unit index at which the chunk following `units[prev_start..prev_end]`
/// should begin so that it repeats a suffix of that chunk.
///
/// Walking backwards from `prev_end - 1`, units are added to the suffix for as
/// long as the space-joined suffix is estimated at no more than
/// `overlap_tokens`. If not even the last unit fits, the result still points at
/// the last unit (never before `prev_start`).
///
/// Returns `prev_end` unchanged when `overlap_tokens` is 0 or `prev_end` is 0.
fn overlap_start(
    units: &[String],
    prev_start: usize,
    prev_end: usize,
    overlap_tokens: usize,
) -> usize {
    if overlap_tokens == 0 || prev_end == 0 {
        return prev_end;
    }

    let mut suffix = String::new();
    let mut first_kept = prev_end;
    let mut idx = prev_end;
    while idx > prev_start {
        idx -= 1;
        let widened = match suffix.as_str() {
            "" => units[idx].clone(),
            rest => format!("{} {}", units[idx], rest),
        };
        if estimate_tokens(&widened) > overlap_tokens {
            break;
        }
        suffix = widened;
        first_kept = idx;
    }

    if first_kept == prev_end {
        // Nothing fit the allowance: fall back to the final unit of the chunk.
        prev_end.saturating_sub(1).max(prev_start)
    } else {
        first_kept
    }
}
// Unit tests for `ChunkOptions` and `chunk_text`. Token counts come from
// `estimate_tokens`, so the budgets below are tuned to that estimator.
#[cfg(test)]
mod tests {
use super::*;
// An empty string produces no chunks.
#[test]
fn empty_input_yields_no_chunks() {
let opts = ChunkOptions::new(100, 0);
assert!(chunk_text("", &opts).is_empty());
}
// Input made only of spaces, tabs and newlines produces no chunks.
#[test]
fn whitespace_only_yields_no_chunks() {
let opts = ChunkOptions::new(100, 0);
assert!(chunk_text(" \n\t \n", &opts).is_empty());
}
// Five English sentences with a 20-token budget must split into several
// chunks, none of which is estimated above the budget.
#[test]
fn latin_packs_within_budget() {
let text = "The quick brown fox jumps. A lazy dog sleeps under the tree. \
Birds sing in the morning light. Rain falls softly on the roof. \
Children play in the garden all day.";
let opts = ChunkOptions::new(20, 8);
let chunks = chunk_text(text, &opts);
assert!(
chunks.len() > 1,
"expected multiple chunks, got {}",
chunks.len()
);
for (i, c) in chunks.iter().enumerate() {
let est = estimate_tokens(c);
assert!(
est <= opts.max_tokens,
"chunk {i} has {est} tokens > max {}",
opts.max_tokens
);
}
}
// With overlap enabled, the last sentence of every chunk must reappear in
// the chunk that follows it.
#[test]
fn latin_chunks_share_overlap() {
let text = "The quick brown fox jumps. A lazy dog sleeps under the tree. \
Birds sing in the morning light. Rain falls softly on the roof. \
Children play in the garden all day.";
let opts = ChunkOptions::new(20, 8);
let chunks = chunk_text(text, &opts);
assert!(chunks.len() > 1);
for w in chunks.windows(2) {
// Last non-blank sentence of the earlier chunk, without its terminator.
let last_sentence = w[0]
.split(|c: char| matches!(c, '.' | '!' | '?'))
.filter(|s| !s.trim().is_empty())
.next_back()
.unwrap_or("")
.trim();
assert!(
!last_sentence.is_empty(),
"expected a trailing sentence to carry into overlap"
);
assert!(
w[1].contains(last_sentence),
"overlap missing: '{last_sentence}' not in next chunk"
);
}
}
// Japanese text terminated by the ideographic full stop must split into
// several chunks, keep every clause, and respect the token budget.
#[test]
fn cjk_fullstop_splits_and_packs() {
let text =
"今日は晴れます。明日は雨が降る。明後日は風が強い。夜は涼しいです。朝はとても寒い。";
let opts = ChunkOptions::new(10, 4);
let chunks = chunk_text(text, &opts);
assert!(
chunks.len() > 1,
"expected multiple CJK chunks, got {}",
chunks.len()
);
let joined = chunks.join("");
for clause in [
"今日は晴れます",
"明日は雨が降る",
"明後日は風が強い",
"夜は涼しいです",
"朝はとても寒い",
] {
assert!(joined.contains(clause), "clause '{clause}' was dropped");
}
for (i, c) in chunks.iter().enumerate() {
let est = estimate_tokens(c);
assert!(
est <= opts.max_tokens,
"CJK chunk {i} has {est} tokens > max {}",
opts.max_tokens
);
}
}
// Text mixing exclamation, question and full-stop terminators: checks that
// no clause is lost from the output.
#[test]
fn cjk_exclamation_and_question_terminate() {
let text = "行きます!何をしますか?帰りましょう。";
let opts = ChunkOptions::new(8, 0);
let chunks = chunk_text(text, &opts);
let joined = chunks.join("");
for clause in ["行きます", "何をしますか", "帰りましょう"] {
assert!(joined.contains(clause), "clause '{clause}' was dropped");
}
}
// One unit with no terminators or line breaks is returned as exactly one
// chunk, verbatim, even with a budget of only 3 tokens.
#[test]
fn single_overlong_unit_emitted_unchanged() {
let text = "abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789";
let opts = ChunkOptions::new(3, 0);
let chunks = chunk_text(text, &opts);
assert_eq!(
chunks.len(),
1,
"a lone overlong unit must be a single chunk"
);
assert_eq!(chunks[0], text, "content must be preserved verbatim");
}
// The defaults are 512 / 64, with the overlap below the budget.
#[test]
fn default_options_are_sensible() {
let opts = ChunkOptions::default();
assert_eq!(opts.max_tokens, 512);
assert_eq!(opts.overlap_tokens, 64);
assert!(opts.overlap_tokens < opts.max_tokens);
}
// Requesting an overlap equal to the budget is clamped by `new`, and
// chunking with the clamped options still splits the text.
#[test]
fn overlap_clamped_below_max() {
let opts = ChunkOptions::new(10, 10);
assert!(opts.overlap_tokens < opts.max_tokens);
let text = "One. Two. Three. Four. Five. Six. Seven. Eight.";
let chunks = chunk_text(text, &opts);
assert!(chunks.len() > 1, "clamped overlap must still split");
}
// Lines without terminators are separated at newlines and none is dropped.
#[test]
fn newlines_split_units() {
let text = "first line\nsecond line\nthird line";
let opts = ChunkOptions::new(5, 0);
let chunks = chunk_text(text, &opts);
assert!(chunks.len() > 1, "newlines should produce multiple units");
let joined = chunks.join(" ");
for line in ["first line", "second line", "third line"] {
assert!(joined.contains(line), "line '{line}' was dropped");
}
}
// Many very short sentences with an overlap just under the budget: every
// chunk's last sentence must still be carried into the next chunk.
#[test]
fn overlap_preserved_for_small_chunks() {
let text = "One. Two. Three. Four. Five. Six. Seven. Eight. Nine. Ten. \
Eleven. Twelve.";
let opts = ChunkOptions::new(8, 7);
let chunks = chunk_text(text, &opts);
assert!(chunks.len() > 1, "expected a split, got {}", chunks.len());
for w in chunks.windows(2) {
// Last non-blank sentence of the earlier chunk, without its terminator.
let last = w[0]
.split(|c: char| matches!(c, '.' | '!' | '?'))
.filter(|s| !s.trim().is_empty())
.next_back()
.unwrap_or("")
.trim();
assert!(
!last.is_empty() && w[1].contains(last),
"overlap missing: '{last}' not carried into next chunk\n c0: {:?}\n c1: {:?}",
w[0],
w[1]
);
}
}
}