use std::sync::Arc;
use crate::character::validate_chunk_config;
use crate::chunk::{measured_spans, TextChunk, TextChunkIter, TextSpan};
use crate::error::ChunkError;
use crate::sizing::{CharSizer, ChunkConfig, ChunkSizer, FunctionSizer};
/// Delimiters installed by `SentenceChunker::new`: a period, exclamation mark or
/// question mark each followed by a single space, plus a bare newline.
const DEFAULT_DELIMITERS: &[&str] = &[". ", "! ", "? ", "\n"];
/// Splits text at sentence delimiters and groups consecutive sentences into chunks.
///
/// Sentences are measured with `length_fn`; a chunk is closed once adding the next
/// sentence would push its measured length past `config.chunk_size`.
#[derive(Clone)]
pub struct SentenceChunker<S = CharSizer> {
/// Chunk size and overlap budgets (plus the sizer they were configured with).
pub(crate) config: ChunkConfig<S>,
/// A chunk that would overflow is still extended until it holds at least this
/// many sentences.
pub(crate) min_sentences_per_chunk: usize,
/// Strings that end a sentence; the matched delimiter stays attached to the
/// sentence it terminates.
pub(crate) delimiters: Vec<String>,
/// Sentences with fewer `char`s than this are merged with the sentences that
/// follow them before chunking.
pub(crate) min_characters_per_sentence: usize,
/// When true, each chunk span is passed through `TextSpan::trim` before it is
/// emitted.
pub(crate) strip_whitespace: bool,
/// Measures a piece of text; used for every size and overlap comparison made
/// while chunking.
length_fn: crate::LengthFn,
}
impl SentenceChunker<CharSizer> {
    /// Creates a chunker that measures text with `crate::char_len`.
    ///
    /// Defaults: one sentence minimum per chunk, the `DEFAULT_DELIMITERS` set,
    /// sentences shorter than 12 characters are merged, and chunk whitespace is
    /// stripped. No validation is performed here; use [`Self::builder`] followed
    /// by `build()` for a checked configuration.
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
        let config = ChunkConfig::new(chunk_size, chunk_overlap, CharSizer);
        let delimiters = DEFAULT_DELIMITERS
            .iter()
            .copied()
            .map(String::from)
            .collect();
        Self {
            config,
            min_sentences_per_chunk: 1,
            delimiters,
            min_characters_per_sentence: 12,
            strip_whitespace: true,
            length_fn: Arc::new(crate::char_len),
        }
    }

    /// Returns a builder pre-populated with the builder's default settings.
    pub fn builder() -> SentenceChunkerBuilder<CharSizer> {
        Default::default()
    }
}
impl<S> SentenceChunker<S>
where
    S: ChunkSizer,
{
    /// Splits `text` into chunks and returns each chunk's text as an owned `String`.
    pub fn split_text(&self, text: &str) -> Vec<String> {
        self.chunks(text)
            .map(|chunk| chunk.text.to_string())
            .collect()
    }

    /// Returns an iterator over the chunks of `text`.
    ///
    /// The chunk spans are computed up front; the returned iterator feeds them
    /// through `measured_spans` and `TextChunkIter` to produce `TextChunk`s that
    /// borrow from `text`.
    pub fn chunks<'a>(&'a self, text: &'a str) -> impl Iterator<Item = TextChunk<'a>> + 'a {
        let len_fn = self.length_fn.as_ref();
        TextChunkIter::new(
            text,
            measured_spans(text, self.chunk_spans(text, len_fn).into_iter(), len_fn),
        )
    }

    /// Collects [`Self::chunks`] into a `Vec`.
    pub fn split_chunks<'a>(&'a self, text: &'a str) -> Vec<TextChunk<'a>> {
        self.chunks(text).collect()
    }

    /// Computes the byte spans of every chunk of `text`.
    ///
    /// Sentences are split on the configured delimiters, short sentences are merged,
    /// and the resulting sentences are packed greedily. `chunk_size` is a soft budget:
    /// a single sentence is never split, a chunk is not closed before it holds
    /// `min_sentences_per_chunk` sentences, and the overlap carried into a new chunk
    /// is not re-checked against `chunk_size`.
    fn chunk_spans(&self, text: &str, len_fn: &dyn Fn(&str) -> usize) -> Vec<TextSpan> {
        if text.is_empty() {
            return Vec::new();
        }
        let sentences = merge_short_sentence_spans(
            text,
            split_into_sentence_spans(text, &self.delimiters),
            self.min_characters_per_sentence,
        );
        if sentences.is_empty() {
            return Vec::new();
        }

        let mut chunks: Vec<TextSpan> = Vec::new();
        // Indices (into `sentences`) of the sentences in the chunk being built.
        let mut current_sentences: Vec<usize> = Vec::new();
        let mut current_len: usize = 0;

        for (i, sentence) in sentences.iter().enumerate() {
            let s_len = len_fn(sentence.text(text));

            // An empty chunk always accepts the sentence, however long it is.
            let over_budget =
                !current_sentences.is_empty() && current_len + s_len > self.config.chunk_size;

            if over_budget && current_sentences.len() >= self.min_sentences_per_chunk {
                chunks.extend(self.join_sentence_spans(text, &sentences, &current_sentences));
                current_sentences.clear();

                // Seed the next chunk with the trailing sentences that fit the overlap.
                let (overlap_start, overlap_len) =
                    self.overlap_window(text, &sentences, i, len_fn);
                current_sentences.extend(overlap_start..i);
                current_len = overlap_len;
            }

            current_sentences.push(i);
            current_len += s_len;
        }

        // Flush whatever is left; `join_sentence_spans` yields `None` for an empty list.
        chunks.extend(self.join_sentence_spans(text, &sentences, &current_sentences));
        chunks
    }

    /// Selects the sentences immediately before index `end` that fit in `chunk_overlap`.
    ///
    /// Walks backwards from `end`, accepting whole sentences while their summed
    /// measured length stays within the overlap budget. Returns the index of the
    /// first overlapping sentence and the summed length; `(end, 0)` means no overlap.
    fn overlap_window(
        &self,
        text: &str,
        sentences: &[TextSpan],
        end: usize,
        len_fn: &dyn Fn(&str) -> usize,
    ) -> (usize, usize) {
        let budget = self.config.chunk_overlap;
        let mut start = end;
        let mut total: usize = 0;
        if budget == 0 {
            return (start, total);
        }
        while start > 0 {
            let candidate_len = len_fn(sentences[start - 1].text(text));
            if total + candidate_len > budget {
                break;
            }
            total += candidate_len;
            start -= 1;
        }
        (start, total)
    }

    /// Builds one span reaching from the first listed sentence to the last.
    ///
    /// Returns `None` when `indices` is empty. With `strip_whitespace` enabled the
    /// span is passed through `TextSpan::trim`, whose `Option` result is returned
    /// as-is (presumably `None` when nothing but whitespace remains).
    fn join_sentence_spans(
        &self,
        input: &str,
        sentences: &[TextSpan],
        indices: &[usize],
    ) -> Option<TextSpan> {
        let start = sentences[*indices.first()?].start;
        let end = sentences[*indices.last()?].end;
        let span = TextSpan::new(start, end);
        if self.strip_whitespace {
            span.trim(input)
        } else {
            Some(span)
        }
    }
}
/// Fluent builder for [`SentenceChunker`]; `build()` validates the settings before
/// handing out the chunker.
#[derive(Clone)]
pub struct SentenceChunkerBuilder<S = CharSizer> {
/// The chunker under construction; each setter writes straight into it.
inner: SentenceChunker<S>,
}
impl Default for SentenceChunkerBuilder<CharSizer> {
    /// Starts from `SentenceChunker::new(1000, 200)`: a chunk size of 1000 and an
    /// overlap of 200, measured in characters.
    fn default() -> Self {
        let inner = SentenceChunker::new(1000, 200);
        Self { inner }
    }
}
impl<S> SentenceChunkerBuilder<S>
where
    S: ChunkSizer,
{
    /// Sets the size budget a chunk is packed up to, in units of the length function.
    pub fn chunk_size(mut self, chunk_size: usize) -> Self {
        self.inner.config.chunk_size = chunk_size;
        self
    }

    /// Sets the budget for trailing sentences repeated at the start of the next chunk.
    pub fn chunk_overlap(mut self, chunk_overlap: usize) -> Self {
        self.inner.config.chunk_overlap = chunk_overlap;
        self
    }

    /// Sets the minimum number of sentences a chunk must hold before it may be closed.
    pub fn min_sentences_per_chunk(mut self, min_sentences_per_chunk: usize) -> Self {
        self.inner.min_sentences_per_chunk = min_sentences_per_chunk;
        self
    }

    /// Replaces the sentence delimiters with the given strings.
    pub fn delimiters(mut self, delimiters: impl IntoIterator<Item = impl Into<String>>) -> Self {
        self.inner.delimiters = delimiters.into_iter().map(Into::into).collect();
        self
    }

    /// Sets the character count below which a sentence is merged with its successors.
    pub fn min_characters_per_sentence(mut self, min_characters_per_sentence: usize) -> Self {
        self.inner.min_characters_per_sentence = min_characters_per_sentence;
        self
    }

    /// Chooses whether chunk spans are trimmed before being emitted.
    pub fn strip_whitespace(mut self, strip_whitespace: bool) -> Self {
        self.inner.strip_whitespace = strip_whitespace;
        self
    }

    /// Switches the builder to a different sizer, keeping every other setting.
    ///
    /// The chunker's length function is rebuilt to delegate to a clone of `sizer`,
    /// so size and overlap comparisons use the new measure.
    pub fn sizer<T>(self, sizer: T) -> SentenceChunkerBuilder<T>
    where
        T: ChunkSizer,
    {
        let inner = self.inner;
        let length_sizer = sizer.clone();
        SentenceChunkerBuilder {
            inner: SentenceChunker {
                config: ChunkConfig::new(
                    inner.config.chunk_size,
                    inner.config.chunk_overlap,
                    sizer,
                ),
                min_sentences_per_chunk: inner.min_sentences_per_chunk,
                delimiters: inner.delimiters,
                min_characters_per_sentence: inner.min_characters_per_sentence,
                strip_whitespace: inner.strip_whitespace,
                length_fn: Arc::new(move |value: &str| length_sizer.size(value)),
            },
        }
    }

    /// Measures text with `length_fn` by wrapping it in a `FunctionSizer`.
    pub fn length_fn(self, length_fn: crate::LengthFn) -> SentenceChunkerBuilder<FunctionSizer> {
        self.sizer(FunctionSizer::new(length_fn))
    }

    /// Validates the configuration and returns the finished chunker.
    ///
    /// # Errors
    ///
    /// Returns a `ChunkError` when `validate_chunk_config` rejects the size/overlap
    /// pair, when no delimiter is configured, when any delimiter is the empty
    /// string, or when `min_sentences_per_chunk` is zero.
    pub fn build(self) -> Result<SentenceChunker<S>, ChunkError> {
        validate_chunk_config(
            self.inner.config.chunk_size,
            self.inner.config.chunk_overlap,
        )?;
        if self.inner.delimiters.is_empty() {
            return Err(ChunkError::invalid_configuration(
                "sentence chunker requires at least one delimiter",
            ));
        }
        // An empty delimiter matches at offset 0 with zero length, so the sentence
        // splitter would never advance past it.
        if self.inner.delimiters.iter().any(|delim| delim.is_empty()) {
            return Err(ChunkError::invalid_configuration(
                "sentence chunker delimiters must not be empty strings",
            ));
        }
        if self.inner.min_sentences_per_chunk == 0 {
            return Err(ChunkError::invalid_configuration(
                "min_sentences_per_chunk must be greater than zero",
            ));
        }
        Ok(self.inner)
    }
}
/// Splits `text` into owned sentences, cutting after the earliest delimiter match.
///
/// Each delimiter stays attached to the sentence it ends, and any text after the
/// last delimiter becomes the final sentence. When two delimiters match at the same
/// offset, the one listed first wins. Empty delimiters are ignored: they would
/// match at offset 0 with zero length and the scan could never advance.
#[cfg(test)]
fn split_into_sentences(text: &str, delimiters: &[String]) -> Vec<String> {
    let mut sentences: Vec<String> = Vec::new();
    let mut remaining = text;
    while !remaining.is_empty() {
        // `min_by_key` returns the first of several equal minima, which keeps the
        // "first delimiter in the list wins" tie-break.
        let earliest = delimiters
            .iter()
            .filter(|delim| !delim.is_empty())
            .filter_map(|delim| remaining.find(delim.as_str()).map(|pos| (pos, delim.len())))
            .min_by_key(|&(pos, _)| pos);

        let Some((pos, delim_len)) = earliest else {
            sentences.push(remaining.to_string());
            break;
        };
        let (sentence, rest) = remaining.split_at(pos + delim_len);
        sentences.push(sentence.to_string());
        remaining = rest;
    }
    sentences
}
/// Splits `text` into sentence spans (byte offsets), cutting after the earliest
/// delimiter match.
///
/// Each delimiter stays inside the span of the sentence it ends, and any text after
/// the last delimiter becomes the final span. When two delimiters match at the same
/// offset, the one listed first wins. Empty delimiters are ignored: they would
/// match at offset 0 with zero length and the scan could never advance.
fn split_into_sentence_spans(text: &str, delimiters: &[String]) -> Vec<TextSpan> {
    let mut sentences = Vec::new();
    let mut start = 0usize;
    while start < text.len() {
        let remaining = &text[start..];
        // `min_by_key` returns the first of several equal minima, which keeps the
        // "first delimiter in the list wins" tie-break.
        let earliest = delimiters
            .iter()
            .filter(|delim| !delim.is_empty())
            .filter_map(|delim| remaining.find(delim.as_str()).map(|pos| (pos, delim.len())))
            .min_by_key(|&(pos, _)| pos);

        let Some((pos, delim_len)) = earliest else {
            sentences.push(TextSpan::new(start, text.len()));
            break;
        };
        // `delim_len >= 1`, so every span is non-empty and `start` always advances.
        let end = start + pos + delim_len;
        sentences.push(TextSpan::new(start, end));
        start = end;
    }
    sentences
}
/// Concatenates consecutive sentences until each merged piece has at least
/// `min_chars` characters.
///
/// A trailing run that never reaches the threshold is appended to the last merged
/// piece, or kept on its own when nothing was merged before it.
#[cfg(test)]
fn merge_short_sentences(sentences: Vec<String>, min_chars: usize) -> Vec<String> {
    let mut merged: Vec<String> = Vec::new();
    let mut pending = String::new();

    for piece in sentences {
        pending += &piece;
        if pending.chars().count() < min_chars {
            continue;
        }
        // Threshold reached: emit the run and start a fresh one.
        merged.push(std::mem::take(&mut pending));
    }

    if pending.is_empty() {
        return merged;
    }
    match merged.last_mut() {
        Some(tail) => tail.push_str(&pending),
        None => merged.push(pending),
    }
    merged
}
/// Merges consecutive sentence spans until each merged span covers at least
/// `min_chars` characters of `input`.
///
/// A merged span runs from the start of the first sentence in the run to the end of
/// the last one. A trailing run that never reaches the threshold extends the last
/// merged span, or is kept on its own when nothing was merged before it.
fn merge_short_sentence_spans(
    input: &str,
    sentences: Vec<TextSpan>,
    min_chars: usize,
) -> Vec<TextSpan> {
    let mut merged: Vec<TextSpan> = Vec::new();
    // Byte offsets `(start, end)` of the run that is still below the threshold.
    let mut pending: Option<(usize, usize)> = None;

    for span in sentences {
        let run_start = pending.map_or(span.start, |(start, _)| start);
        let run_end = span.end;
        let run = TextSpan::new(run_start, run_end);
        if run.text(input).chars().count() >= min_chars {
            merged.push(run);
            pending = None;
        } else {
            pending = Some((run_start, run_end));
        }
    }

    if let Some((tail_start, tail_end)) = pending {
        let tail = TextSpan::new(tail_start, tail_end);
        match merged.last_mut() {
            Some(previous) => previous.end = tail.end,
            None => merged.push(tail),
        }
    }
    merged
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Character-sized chunker with the given budgets and sentence thresholds; every
    /// other setting is whatever `SentenceChunker::new` provides.
    fn chunker_with(
        chunk_size: usize,
        chunk_overlap: usize,
        min_sentences: usize,
        min_chars: usize,
    ) -> SentenceChunker {
        SentenceChunker {
            min_sentences_per_chunk: min_sentences,
            min_characters_per_sentence: min_chars,
            ..SentenceChunker::new(chunk_size, chunk_overlap)
        }
    }

    #[test]
    fn test_sentence_chunker_basic() {
        let chunks = chunker_with(30, 0, 1, 1)
            .split_text("Hello world. How are you? I am fine. Thank you.");
        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].contains("Hello world."));
        assert!(chunks[1].contains("Thank you."));
    }

    #[test]
    fn test_sentence_chunker_no_delimiters() {
        let chunks = SentenceChunker::new(100, 0).split_text("No delimiters in this text");
        assert_eq!(chunks, vec!["No delimiters in this text"]);
    }

    #[test]
    fn test_sentence_chunker_empty() {
        let chunks = SentenceChunker::new(100, 0).split_text("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_sentence_chunker_delimiter_at_end() {
        // The trailing space of the delimiter is stripped from the chunk.
        let chunks = chunker_with(100, 0, 1, 1).split_text("Hello world. ");
        assert_eq!(chunks, vec!["Hello world."]);
    }

    #[test]
    fn test_sentence_chunker_min_chars_filtering() {
        let chunks =
            chunker_with(100, 0, 1, 15).split_text("Hi. How are you doing today? Fine thanks.");
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_sentence_chunker_overlap() {
        let text = "Schemas define structure. Vectorizers create embeddings. Workers process pending rows. Queries retrieve semantic context.";
        let chunks = chunker_with(60, 35, 1, 1).split_text(text);
        assert!(
            chunks.len() >= 2,
            "Expected multiple chunks, got {:?}",
            chunks
        );
        let shared = "Vectorizers create embeddings.";
        assert!(
            chunks[0].contains(shared) && chunks[1].contains(shared),
            "Expected overlap sentence to appear in adjacent chunks: {:?}",
            chunks
        );
    }

    #[test]
    fn test_split_into_sentences() {
        let delimiters: Vec<String> = [". ", "! ", "? "]
            .iter()
            .map(|delim| delim.to_string())
            .collect();
        let sentences =
            split_into_sentences("Hello world. How are you? Fine! Thanks.", &delimiters);
        assert_eq!(
            sentences,
            vec!["Hello world. ", "How are you? ", "Fine! ", "Thanks."]
        );
    }

    #[test]
    fn test_merge_short_sentences() {
        let sentences: Vec<String> = ["Hi. ", "How are you? ", "Good. "]
            .iter()
            .map(|sentence| sentence.to_string())
            .collect();
        let merged = merge_short_sentences(sentences, 10);
        assert_eq!(merged, vec!["Hi. How are you? Good. "]);
    }

    #[test]
    fn test_sentence_chunker_min_sentences_per_chunk() {
        let chunks = chunker_with(7, 0, 2, 1).split_text("A. B. C. D.");
        assert_eq!(chunks, vec!["A. B.", "C. D."]);
    }

    #[test]
    fn test_sentence_chunker_min_sentences_best_effort() {
        // Two sentences already exceed the size budget, but the minimum still applies.
        let chunks = chunker_with(7, 0, 2, 1).split_text("AAAA. BBBB. CCCC.");
        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].contains("AAAA."));
        assert!(chunks[0].contains("BBBB."));
    }
}