/// One contiguous slice of a source text, together with optional metadata.
///
/// `chunk_text` fills in the positional fields and `text`, and leaves every
/// metadata field as `None`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Zero-based position of this chunk in the sequence it was produced in.
    pub ordinal: usize,
    /// Byte offset into the source text where this chunk begins (inclusive).
    pub byte_start: usize,
    /// Byte offset into the source text where this chunk ends (exclusive).
    pub byte_end: usize,
    /// Owned copy of the source text in `byte_start..byte_end`.
    pub text: String,
    /// Optional role label; presumably the speaker of a conversation turn
    /// (TODO confirm against the code that populates it).
    pub role: Option<String>,
    /// Optional identifier of the session the chunk belongs to.
    pub session_id: Option<String>,
    /// Optional identifier of the turn the chunk belongs to.
    pub turn_id: Option<String>,
    /// Optional name of a tool associated with the chunk.
    pub tool_name: Option<String>,
    /// Optional Unix timestamp; the unit (seconds vs. milliseconds) is not
    /// established in this file (TODO confirm).
    pub timestamp_unix: Option<i64>,
}
/// Size limits that control how `chunk_text` splits its input.
///
/// Both limits are counted in `char`s, not bytes. `chunk_text` panics unless
/// `max_chars > 0` and `max_chars >= min_chars`.
#[derive(Debug, Clone, Copy)]
pub struct ChunkOptions {
    /// Upper bound on the number of characters in a single chunk.
    pub max_chars: usize,
    /// Number of characters a chunk must contain before a newline or
    /// whitespace break is accepted as its end. The final chunk of a text may
    /// still be shorter than this.
    pub min_chars: usize,
}

impl Default for ChunkOptions {
    /// Returns the default limits: at most 1200 characters per chunk, with
    /// breaks considered only after 200 characters.
    fn default() -> Self {
        ChunkOptions {
            min_chars: 200,
            max_chars: 1200,
        }
    }
}
/// Splits `text` into consecutive, non-overlapping chunks of at most
/// `opts.max_chars` characters each.
///
/// Lengths are counted in `char`s rather than bytes, so every chunk boundary
/// falls on a UTF-8 character boundary. Chunks are contiguous: each chunk's
/// `byte_end` equals the next chunk's `byte_start`, and concatenating the
/// `text` fields in order reproduces `text` exactly. When the remaining text
/// does not fit in one chunk, the cut position is chosen by `pick_break`.
///
/// Only `ordinal`, `byte_start`, `byte_end` and `text` are populated; all
/// metadata fields of the returned chunks are `None`.
///
/// An empty `text` yields an empty vector. A `min_chars` of 0 is treated as 1,
/// because every chunk must contain at least one character.
///
/// # Panics
///
/// Panics if `opts.max_chars` is zero or smaller than `opts.min_chars`.
pub fn chunk_text(text: &str, opts: ChunkOptions) -> Vec<Chunk> {
    assert!(
        opts.max_chars >= opts.min_chars && opts.max_chars > 0,
        "invalid chunk options: max={}, min={}",
        opts.max_chars,
        opts.min_chars
    );
    if text.is_empty() {
        return Vec::new();
    }

    // Table of (byte offset, char) so character positions can be mapped back
    // to byte offsets in `text`.
    let char_indices: Vec<(usize, char)> = text.char_indices().collect();
    let n_chars = char_indices.len();

    // Guarantee forward progress: with a minimum of 0 the break search may
    // select `start_char` itself (when the previous chunk ended on a newline
    // or whitespace), producing an empty chunk and never advancing.
    let min_chars = opts.min_chars.max(1);

    // At least ceil(n_chars / max_chars) chunks will be produced.
    let mut chunks = Vec::with_capacity(n_chars / opts.max_chars + 1);
    let mut start_char = 0usize;
    while start_char < n_chars {
        let end_target = start_char.saturating_add(opts.max_chars).min(n_chars);
        // The tail of the text is emitted as-is; only interior cuts look for
        // a natural break.
        let end_char = if end_target == n_chars {
            n_chars
        } else {
            pick_break(&char_indices, start_char, end_target, min_chars)
        };

        let byte_start = char_indices[start_char].0;
        // `end_char == n_chars` has no table entry; it maps to the end of `text`.
        let byte_end = char_indices
            .get(end_char)
            .map_or(text.len(), |&(offset, _)| offset);

        let ordinal = chunks.len();
        chunks.push(Chunk {
            ordinal,
            byte_start,
            byte_end,
            text: text[byte_start..byte_end].to_string(),
            role: None,
            session_id: None,
            turn_id: None,
            tool_name: None,
            timestamp_unix: None,
        });
        start_char = end_char;
    }
    chunks
}
/// Chooses the exclusive end index (in characters) for the chunk that starts
/// at `start_char` and may extend no further than `end_target`.
///
/// Candidate cut positions are `start_char + min_chars .. end_target`, searched
/// from the latest to the earliest, in three passes of decreasing preference:
///
/// 1. directly after two consecutive `'\n'` characters (paragraph break),
/// 2. directly after a single `'\n'` (line break),
/// 3. directly after any whitespace character.
///
/// The break characters stay at the end of the chunk being closed. If no
/// candidate matches, `end_target` is returned (hard cut).
///
/// `chars` is the `(byte offset, char)` table of the whole text and
/// `end_target` must not exceed `chars.len()`. The result is always greater
/// than `start_char` when `end_target > start_char`: a `min_chars` of 0 is
/// treated as 1 so that a break character sitting just before `start_char`
/// can never be chosen, which would describe an empty chunk.
fn pick_break(
    chars: &[(usize, char)],
    start_char: usize,
    end_target: usize,
    min_chars: usize,
) -> usize {
    // Lowest admissible cut. Clamping the minimum to 1 keeps `start_char`
    // itself out of the candidate range.
    let lower = start_char
        .saturating_add(min_chars.max(1))
        .min(end_target);

    // Every candidate `i` satisfies `i >= lower >= 1`, so `chars[i - 1]` is
    // always in bounds; only the two-character lookback needs its own guard.
    let candidates = || (lower..end_target).rev();

    candidates()
        .find(|&i| i >= 2 && chars[i - 1].1 == '\n' && chars[i - 2].1 == '\n')
        .or_else(|| candidates().find(|&i| chars[i - 1].1 == '\n'))
        .or_else(|| candidates().find(|&i| chars[i - 1].1.is_whitespace()))
        .unwrap_or(end_target)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Concatenates the chunk texts in order.
    fn reassemble(chunks: &[Chunk]) -> String {
        chunks.iter().map(|chunk| chunk.text.as_str()).collect()
    }

    /// An empty string produces no chunks at all.
    #[test]
    fn empty_input_yields_no_chunks() {
        let chunks = chunk_text("", ChunkOptions::default());
        assert!(chunks.is_empty());
    }

    /// Text shorter than `max_chars` comes back as one chunk spanning it all.
    #[test]
    fn short_input_is_single_chunk() {
        let chunks = chunk_text("hello world", ChunkOptions::default());
        assert_eq!(chunks.len(), 1);
        let only = &chunks[0];
        assert_eq!(only.ordinal, 0);
        assert_eq!(only.text, "hello world");
        assert_eq!(only.byte_start, 0);
        assert_eq!(only.byte_end, 11);
    }

    /// Chunking the same input twice gives equal results.
    #[test]
    fn output_is_deterministic() {
        let input = "foo bar baz ".repeat(500);
        let first = chunk_text(&input, ChunkOptions::default());
        let second = chunk_text(&input, ChunkOptions::default());
        assert_eq!(first, second);
    }

    /// A blank line inside the window is chosen as the cut point.
    #[test]
    fn prefers_paragraph_break() {
        let input = format!("{}\n\n{}", "a".repeat(20), "b".repeat(40));
        let chunks = chunk_text(
            &input,
            ChunkOptions {
                max_chars: 50,
                min_chars: 10,
            },
        );
        assert!(chunks.len() >= 2);
        assert!(chunks[0].text.ends_with("\n\n"));
        assert!(chunks[1].text.starts_with('b'));
    }

    /// Without a blank line, a single newline is used as the cut point.
    #[test]
    fn prefers_line_break_when_no_paragraph() {
        let row = "x".repeat(20);
        let input = format!("{}\n{}", row, row);
        let chunks = chunk_text(
            &input,
            ChunkOptions {
                max_chars: 30,
                min_chars: 5,
            },
        );
        assert!(chunks[0].text.ends_with('\n'));
    }

    /// Chunks are contiguous, consecutively numbered, and lossless.
    #[test]
    fn chunks_cover_full_text() {
        let input: String = ('a'..='z').cycle().take(3000).collect();
        let chunks = chunk_text(&input, ChunkOptions::default());
        assert_eq!(reassemble(&chunks), input);
        for (prev, next) in chunks.iter().zip(chunks.iter().skip(1)) {
            assert_eq!(prev.byte_end, next.byte_start);
            assert_eq!(prev.ordinal + 1, next.ordinal);
        }
    }

    /// Multi-byte characters are never split: every offset is a char boundary.
    #[test]
    fn handles_multibyte_utf8() {
        let input = "世界".repeat(1000);
        let chunks = chunk_text(&input, ChunkOptions::default());
        assert_eq!(reassemble(&chunks), input);
        for chunk in chunks.iter() {
            assert!(input.is_char_boundary(chunk.byte_start));
            assert!(input.is_char_boundary(chunk.byte_end));
        }
    }

    /// With nothing to break on, the cut happens exactly at `max_chars`.
    #[test]
    fn hard_cut_when_no_whitespace_in_window() {
        let chunks = chunk_text(
            "abcdefghijklmno",
            ChunkOptions {
                max_chars: 10,
                min_chars: 5,
            },
        );
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "abcdefghij");
        assert_eq!(chunks[1].text, "klmno");
    }
}