/// Common interface for strategies that split a piece of text into chunks.
///
/// The `Send + Sync` supertraits mean every implementor must be safe to move
/// to, and to share between, threads.
pub trait Chunker: Send + Sync {
/// Splits `text` and returns the resulting chunks as owned strings.
fn chunk(&self, text: &str) -> Vec<String>;
/// Returns a string identifying this chunking strategy.
fn name(&self) -> &str;
}
/// Pass-through strategy: the input is never split.
pub struct NoChunker;

impl Chunker for NoChunker {
    /// Returns a one-element vector holding an owned copy of `text`.
    ///
    /// The copy is exact (no trimming), and an empty `text` still yields a
    /// single empty string.
    fn chunk(&self, text: &str) -> Vec<String> {
        let whole = String::from(text);
        vec![whole]
    }

    /// Identifier of this strategy: `"no_chunking"`.
    fn name(&self) -> &str {
        const LABEL: &str = "no_chunking";
        LABEL
    }
}
/// Splits text into sentences at terminal punctuation (`.`, `!`, `?`).
pub struct SentenceChunker;

impl Chunker for SentenceChunker {
    /// Returns one chunk per sentence, each with surrounding whitespace trimmed.
    ///
    /// A sentence ends at a `.`, `!` or `?`. A run of consecutive terminators
    /// (for example `...` or `?!`) stays attached to the sentence it closes
    /// instead of producing punctuation-only chunks. Text after the last
    /// terminator becomes a final chunk when it is not blank.
    ///
    /// If nothing would be produced (empty or whitespace-only input), the
    /// result is a single chunk containing `text` unchanged.
    fn chunk(&self, text: &str) -> Vec<String> {
        fn is_terminator(ch: char) -> bool {
            matches!(ch, '.' | '!' | '?')
        }

        let mut chunks = Vec::new();
        let mut current = String::new();
        let mut chars = text.chars().peekable();

        while let Some(ch) = chars.next() {
            current.push(ch);
            if !is_terminator(ch) {
                continue;
            }
            // Absorb the rest of a terminator run so "Wait..." or "What?!"
            // is emitted as one sentence rather than "Wait.", ".", ".".
            while let Some(extra) = chars.next_if(|&next| is_terminator(next)) {
                current.push(extra);
            }
            // `current` ends with a terminator here, so the trimmed text is
            // never empty.
            chunks.push(current.trim().to_owned());
            current.clear();
        }

        let remainder = current.trim();
        if !remainder.is_empty() {
            chunks.push(remainder.to_owned());
        }

        // Blank input: hand the caller back exactly what was passed in.
        if chunks.is_empty() {
            chunks.push(text.to_owned());
        }
        chunks
    }

    /// Identifier of this strategy: `"sentence_chunking"`.
    fn name(&self) -> &str {
        "sentence_chunking"
    }
}
/// Sliding-window chunker over whitespace-separated words.
pub struct FixedSizeChunker {
    /// Maximum number of words in each chunk.
    pub chunk_size: usize,
    /// Number of words by which the window advance is reduced, i.e. how many
    /// words consecutive chunks share. Values `>= chunk_size` make the window
    /// advance by a single word.
    pub overlap: usize,
}

impl FixedSizeChunker {
    /// Creates a chunker producing windows of `chunk_size` words that advance
    /// by `chunk_size - overlap` words (at least one).
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        Self { chunk_size, overlap }
    }
}

impl Chunker for FixedSizeChunker {
    /// Splits `text` into windows of up to `chunk_size` words joined by a
    /// single space.
    ///
    /// The window keeps sliding until its start passes the last word, so the
    /// final chunk may be shorter than `chunk_size` and, with a non-zero
    /// overlap, may contain only words already present in the previous chunk.
    ///
    /// Returns a single chunk holding `text` unchanged when `chunk_size` is
    /// zero or when `text` contains no words.
    fn chunk(&self, text: &str) -> Vec<String> {
        // A zero-word window cannot hold any content; without this guard the
        // loop would emit one empty string per word.
        if self.chunk_size == 0 {
            return vec![text.to_owned()];
        }

        let words: Vec<&str> = text.split_whitespace().collect();
        if words.is_empty() {
            return vec![text.to_owned()];
        }

        // Always advance by at least one word so the iteration terminates
        // even when `overlap >= chunk_size`.
        let step = self.chunk_size.saturating_sub(self.overlap).max(1);

        (0..words.len())
            .step_by(step)
            .map(|start| {
                // Saturate so a huge `chunk_size` cannot overflow `usize`.
                let end = start.saturating_add(self.chunk_size).min(words.len());
                words[start..end].join(" ")
            })
            .collect()
    }

    /// Identifier of this strategy: `"fixed_size_chunking"`.
    fn name(&self) -> &str {
        "fixed_size_chunking"
    }
}
/// Splits text into paragraphs separated by blank lines.
pub struct ParagraphChunker;

impl Chunker for ParagraphChunker {
    /// Returns one trimmed chunk per non-blank paragraph.
    ///
    /// A paragraph break is an empty line written with either Unix (`"\n\n"`)
    /// or Windows (`"\r\n\r\n"`) line endings. Line endings inside a paragraph
    /// are left untouched.
    ///
    /// If no non-blank paragraph is found, the result is a single chunk
    /// containing `text` unchanged.
    fn chunk(&self, text: &str) -> Vec<String> {
        let chunks: Vec<String> = text
            // CRLF blank lines contain no "\n\n", so split on them explicitly.
            .split("\r\n\r\n")
            .flat_map(|section| section.split("\n\n"))
            .map(str::trim)
            .filter(|paragraph| !paragraph.is_empty())
            .map(str::to_owned)
            .collect();

        if chunks.is_empty() {
            vec![text.to_owned()]
        } else {
            chunks
        }
    }

    /// Identifier of this strategy: `"paragraph_chunking"`.
    fn name(&self) -> &str {
        "paragraph_chunking"
    }
}
/// Groups whitespace-separated words into consecutive, non-overlapping chunks.
pub struct WordChunker {
    /// Maximum number of words per chunk; zero disables splitting.
    pub chunk_size: usize,
}

impl WordChunker {
    /// Creates a chunker that emits groups of up to `chunk_size` words.
    pub fn new(chunk_size: usize) -> Self {
        Self { chunk_size }
    }
}

impl Chunker for WordChunker {
    /// Splits `text` into groups of up to `chunk_size` words, each joined by a
    /// single space. The last group may hold fewer words.
    ///
    /// When `chunk_size` is zero or `text` contains no words, the result is a
    /// single chunk holding `text` unchanged.
    fn chunk(&self, text: &str) -> Vec<String> {
        if self.chunk_size > 0 {
            let tokens: Vec<&str> = text.split_whitespace().collect();
            if !tokens.is_empty() {
                let mut pieces = Vec::new();
                for group in tokens.chunks(self.chunk_size) {
                    pieces.push(group.join(" "));
                }
                return pieces;
            }
        }
        // Nothing to split: return the input exactly as received.
        vec![text.to_string()]
    }

    /// Identifier of this strategy: `"word_chunking"`.
    fn name(&self) -> &str {
        "word_chunking"
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A chunk size of zero disables splitting: the input comes back unchanged.
    #[test]
    fn word_chunker_zero_chunk_size_returns_whole_text() {
        let chunks: Vec<String> = WordChunker::new(0).chunk("hello world foo bar");
        assert_eq!(chunks, vec!["hello world foo bar"]);
    }

    /// Words are grouped in pairs; the odd word out forms a shorter last chunk.
    #[test]
    fn word_chunker_normal_chunking() {
        let chunks: Vec<String> = WordChunker::new(2).chunk("a b c d e");
        assert_eq!(chunks, vec!["a b", "c d", "e"]);
    }

    /// A three-word window with one word of overlap advances two words at a
    /// time, so the last window starts on the final word.
    #[test]
    fn fixed_size_chunker_with_overlap() {
        let chunks: Vec<String> = FixedSizeChunker::new(3, 1).chunk("a b c d e");
        assert_eq!(chunks, vec!["a b c", "c d e", "e"]);
    }
}