/// Configuration for `SentenceChunker`.
///
/// Both limits are expressed in *characters* (Unicode scalar values), matching
/// the field names — not in bytes.
#[derive(Debug, Clone)]
pub struct ChunkerConfig {
    /// A chunk shorter than this is held back and merged with following text.
    pub min_chunk_chars: usize,
    /// Once the buffer grows past this many characters, a chunk is forced out
    /// even without a clean sentence boundary.
    pub max_chunk_chars: usize,
}

impl Default for ChunkerConfig {
    fn default() -> Self {
        Self {
            min_chunk_chars: 20,
            max_chunk_chars: 250,
        }
    }
}

/// Incremental sentence-level chunker for streamed text.
///
/// Feed tokens with `push_token`; complete sentences (or paragraphs, on a
/// blank line) are returned as soon as they can be detected. Call
/// `force_flush` at end of stream to drain whatever is left.
#[derive(Debug, Default)]
pub struct SentenceChunker {
    // Text received so far that has not yet been emitted as a chunk.
    buffer: String,
    config: ChunkerConfig,
}

impl SentenceChunker {
    /// Creates a chunker with the default configuration.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a chunker with an explicit configuration.
    pub fn with_config(config: ChunkerConfig) -> Self {
        Self {
            buffer: String::new(),
            config,
        }
    }

    /// Appends `token` to the buffer and returns any chunks that became
    /// complete as a result (possibly none).
    pub fn push_token(&mut self, token: &str) -> Vec<String> {
        self.buffer.push_str(token);
        self.emit_all()
    }

    /// Repeatedly emits chunks until no further boundary can be found.
    fn emit_all(&mut self) -> Vec<String> {
        let mut results = Vec::new();
        while let Some(sentence) = self.try_emit() {
            results.push(sentence);
        }
        results
    }

    /// Flushes whatever remains in the buffer as a final chunk.
    ///
    /// Returns `None` (and clears the buffer) when it holds only whitespace.
    /// The returned text is trimmed, consistent with regularly emitted chunks
    /// — previously a leftover such as `" tail"` kept its leading space.
    pub fn force_flush(&mut self) -> Option<String> {
        let text = std::mem::take(&mut self.buffer);
        let trimmed = text.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_string())
        }
    }

    /// Attempts to emit a single chunk from the front of the buffer.
    fn try_emit(&mut self) -> Option<String> {
        // Limits are in characters to match the `*_chunk_chars` names; using
        // byte length would over-count multibyte UTF-8 text.
        if self.buffer.chars().count() > self.config.max_chunk_chars {
            return self.force_flush_at_best_point();
        }
        // A blank line (paragraph break) is always a chunk boundary.
        if let Some(pos) = self.buffer.find("\n\n") {
            let split_pos = pos + 2;
            let candidate = self.buffer[..split_pos].trim().to_string();
            if candidate.is_empty() {
                // Leading blank lines: drop them and wait for real content.
                self.buffer = self.buffer[split_pos..].to_string();
                return None;
            }
            if candidate.chars().count() >= self.config.min_chunk_chars {
                self.buffer = self.buffer[split_pos..].to_string();
                return Some(candidate);
            }
            // Paragraph still shorter than the minimum; wait for more tokens.
            return None;
        }
        // Otherwise scan for a sentence boundary, widening past candidates
        // that fall short of the configured minimum length.
        let mut search_from = 0usize;
        loop {
            let (split_pos, _) = self.find_sentence_boundary_from(search_from)?;
            let candidate = self.buffer[..split_pos].trim().to_string();
            if candidate.chars().count() >= self.config.min_chunk_chars {
                self.buffer = self.buffer[split_pos..].to_string();
                return Some(candidate);
            }
            // Too short: keep the text and look for the next boundary.
            // `split_pos` is strictly past `search_from`, so this terminates.
            search_from = split_pos;
        }
    }

    /// Finds the first sentence-final `.`, `!`, or `?` at or after
    /// `from_byte`, returning the byte offset just past the punctuation and
    /// the punctuation character.
    ///
    /// A `.` flanked by ASCII digits (e.g. `4.50`) is a decimal point, not a
    /// boundary. Mid-buffer punctuation only counts when followed by
    /// whitespace and then an uppercase letter; punctuation at the very end
    /// of the buffer always counts. Single pass, no per-call allocation.
    fn find_sentence_boundary_from(&self, from_byte: usize) -> Option<(usize, char)> {
        let mut prev: Option<char> = None;
        let mut iter = self.buffer.char_indices().peekable();
        while let Some((byte_pos, ch)) = iter.next() {
            let next = iter.peek().map(|&(_, c)| c);
            if byte_pos >= from_byte && matches!(ch, '.' | '!' | '?') {
                let is_decimal = ch == '.'
                    && prev.map_or(false, |p| p.is_ascii_digit())
                    && next.map_or(false, |n| n.is_ascii_digit());
                if !is_decimal {
                    let after_punct = byte_pos + ch.len_utf8();
                    if after_punct >= self.buffer.len()
                        || Self::starts_with_whitespace_then_upper(&self.buffer[after_punct..])
                    {
                        return Some((after_punct, ch));
                    }
                }
            }
            prev = Some(ch);
        }
        None
    }

    /// True when `s` begins with at least one whitespace character and the
    /// first non-whitespace character after it is uppercase.
    fn starts_with_whitespace_then_upper(s: &str) -> bool {
        let mut chars = s.chars();
        if !chars.next().map_or(false, char::is_whitespace) {
            return false;
        }
        chars
            .find(|c| !c.is_whitespace())
            .map_or(false, |c| c.is_uppercase())
    }

    /// Emergency split used when the buffer exceeds `max_chunk_chars`:
    /// prefer the earliest sentence boundary, otherwise flush everything.
    fn force_flush_at_best_point(&mut self) -> Option<String> {
        if let Some((split_pos, _)) = self.find_sentence_boundary_from(0) {
            let candidate = self.buffer[..split_pos].trim().to_string();
            if !candidate.is_empty() {
                self.buffer = self.buffer[split_pos..].to_string();
                return Some(candidate);
            }
        }
        self.force_flush()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds `tokens` through a fresh chunker built from `config` and
    /// collects every emitted chunk, including the force-flushed remainder.
    fn chunk_text(tokens: &[&str], config: ChunkerConfig) -> Vec<String> {
        let mut chunker = SentenceChunker::with_config(config);
        let mut results = Vec::new();
        for token in tokens {
            results.extend(chunker.push_token(token));
        }
        if let Some(remainder) = chunker.force_flush() {
            results.push(remainder);
        }
        results
    }

    /// `chunk_text` with `min_chunk_chars: 1` so every detected sentence is
    /// emitted individually.
    fn chunk_text_default(tokens: &[&str]) -> Vec<String> {
        chunk_text(
            tokens,
            ChunkerConfig {
                min_chunk_chars: 1,
                max_chunk_chars: 250,
            },
        )
    }

    // A '.' flanked by digits ("4.50") is a decimal point, not a boundary.
    #[test]
    fn test_decimal_no_split() {
        let tokens = vec!["Price is $4.50. Buy now!"];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Price is $4.50.", "Buy now!"]);
    }

    // '.', '!', and '?' each end a sentence when followed by an uppercase word.
    #[test]
    fn test_multiple_sentences() {
        let tokens = vec!["Hello! How are you? Fine."];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Hello!", "How are you?", "Fine."]);
    }

    // Text longer than max_chunk_chars with no boundary is flushed whole.
    #[test]
    fn test_force_flush_long_text() {
        let config = ChunkerConfig {
            min_chunk_chars: 1,
            max_chunk_chars: 250,
        };
        let long_text = "a".repeat(300);
        let tokens = vec![long_text.as_str()];
        let result = chunk_text(&tokens, config);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], long_text);
    }

    // An unterminated sentence is returned by force_flush.
    #[test]
    fn test_force_flush_remainder() {
        let mut chunker = SentenceChunker::default();
        chunker.push_token("Hello there");
        let flushed = chunker.force_flush();
        assert_eq!(flushed, Some("Hello there".to_string()));
    }

    // Flushing an untouched chunker yields nothing.
    #[test]
    fn test_force_flush_empty() {
        let mut chunker = SentenceChunker::default();
        assert_eq!(chunker.force_flush(), None);
    }

    // Whitespace-only buffers are discarded, not emitted.
    #[test]
    fn test_force_flush_whitespace_only() {
        let mut chunker = SentenceChunker::default();
        chunker.push_token(" ");
        assert_eq!(chunker.force_flush(), None);
    }

    // Sentences assembled token-by-token (LLM-style streaming) still split
    // at the same boundaries as a single push.
    #[test]
    fn test_streaming_tokens() {
        let tokens = vec![
            "Hello", " ", "world", ".", " ", "How", " ", "are", " ", "you", "?",
        ];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Hello world.", "How are you?"]);
    }

    // A blank line ("\n\n") always terminates a chunk.
    #[test]
    fn test_paragraph_break() {
        let tokens = vec!["First paragraph.\n\nSecond paragraph."];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["First paragraph.", "Second paragraph."]);
    }

    // Sentences shorter than min_chunk_chars are held and merged with the
    // following sentence before being emitted.
    #[test]
    fn test_min_chunk_chars_holds() {
        let config = ChunkerConfig {
            min_chunk_chars: 20,
            max_chunk_chars: 250,
        };
        let mut chunker = SentenceChunker::with_config(config);
        assert!(chunker.push_token("Hi. ").is_empty());
        let result = chunker.push_token("What is the meaning of life? I wonder.");
        assert!(!result.is_empty());
        assert!(result[0].len() >= 20);
    }

    // Version strings ("v2.0") hit the same digit-dot-digit guard as decimals.
    #[test]
    fn test_version_number_no_split() {
        let tokens = vec!["Use v2.0 for this. It is better."];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Use v2.0 for this.", "It is better."]);
    }

    #[test]
    fn test_exclamation_and_question() {
        let tokens = vec!["Wow! Really? Yes."];
        let result = chunk_text_default(&tokens);
        assert_eq!(result, vec!["Wow!", "Really?", "Yes."]);
    }

    // When the buffer overflows max_chunk_chars, the split prefers the
    // earliest sentence boundary rather than flushing everything at once.
    #[test]
    fn test_max_chunk_with_boundary() {
        let config = ChunkerConfig {
            min_chunk_chars: 1,
            max_chunk_chars: 50,
        };
        let tokens = vec![
            "Short sentence here. And then a much longer sentence that pushes over the limit.",
        ];
        let result = chunk_text(&tokens, config);
        assert_eq!(result[0], "Short sentence here.");
        assert!(result.len() >= 2);
    }
}