pub mod semantic;
use crate::models::{Chunk, Document};
use crate::traits::{Chunker, Result};
/// Splits a document's text into fixed-size pieces that may overlap.
///
/// Both settings are byte counts: the chunking code compares them against
/// the byte length of the content and slices the content by byte offsets.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RecursiveCharacterChunker {
    /// Upper bound, in bytes, on the length of each produced chunk.
    pub chunk_size: usize,
    /// Number of bytes at the end of one chunk that are repeated at the
    /// start of the next one.
    pub chunk_overlap: usize,
}
impl RecursiveCharacterChunker {
    /// Builds a chunker from a chunk size and an overlap.
    ///
    /// The two arguments are stored verbatim in the fields of the same
    /// name; no validation is performed here.
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
        Self { chunk_size, chunk_overlap }
    }
}
impl Chunker for RecursiveCharacterChunker {
    /// Splits `document.content` into consecutive, optionally overlapping chunks.
    ///
    /// Sizes are measured in bytes. Each chunk holds at most `chunk_size`
    /// bytes, shortened where necessary so that it never ends in the middle
    /// of a multi-byte UTF-8 character. The following chunk starts
    /// `chunk_overlap` bytes before the end of the previous one, moved back
    /// to the nearest character boundary.
    ///
    /// Degenerate settings are tolerated instead of panicking or looping
    /// forever:
    ///
    /// * a `chunk_size` of `0` is treated as `1`;
    /// * a character wider than `chunk_size` is emitted whole as its own chunk;
    /// * when applying the overlap would not move the start position forward
    ///   (overlap not smaller than the chunk, or the boundary adjustment
    ///   walked back too far), the overlap is dropped for that step.
    ///
    /// Every chunk carries a clone of `document.id` and a zero-based `index`.
    /// Empty content yields an empty vector. This implementation always
    /// returns `Ok`.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        let content = &document.content;
        let mut chunks = Vec::new();
        if content.is_empty() {
            return Ok(chunks);
        }

        // A zero-sized window could never consume any input.
        let chunk_size = self.chunk_size.max(1);
        let mut start: usize = 0;
        let mut index = 0;

        while start < content.len() {
            // Tentative end of the window, then pulled back onto a char boundary.
            let mut end = start.saturating_add(chunk_size).min(content.len());
            while end > start && !content.is_char_boundary(end) {
                end -= 1;
            }
            if end == start {
                // The character at `start` is wider than the window: take it
                // whole. `content.len()` is always a boundary, so this stops.
                end = start + 1;
                while !content.is_char_boundary(end) {
                    end += 1;
                }
            }

            chunks.push(Chunk {
                document_id: document.id.clone(),
                index,
                text: content[start..end].to_string(),
            });
            if end == content.len() {
                break;
            }

            // Step back by the overlap, landing on a char boundary.
            let mut next_start = end.saturating_sub(self.chunk_overlap);
            while next_start > 0 && !content.is_char_boundary(next_start) {
                next_start -= 1;
            }
            // Guarantee forward progress; otherwise the loop would never end.
            start = if next_start > start { next_start } else { end };
            index += 1;
        }

        Ok(chunks)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Borrows the text of every chunk so whole results can be compared at once.
    fn texts(chunks: &[Chunk]) -> Vec<&str> {
        chunks.iter().map(|chunk| chunk.text.as_str()).collect()
    }

    /// Plain ASCII input: windows of 10 bytes, each restarting 2 bytes early.
    #[test]
    fn test_recursive_chunker() {
        let chunker = RecursiveCharacterChunker::new(10, 2);
        let doc = Document::new("hello world this is a test");
        let chunks = chunker.chunk(&doc).unwrap();
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].text, "hello worl");
        assert_eq!(
            texts(&chunks),
            vec!["hello worl", "rld this i", " is a test"]
        );
    }

    #[test]
    fn test_empty_document_produces_no_chunks() {
        let chunker = RecursiveCharacterChunker::new(100, 10);
        let doc = Document::new("");
        let chunks = chunker.chunk(&doc).unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_content_smaller_than_chunk_size() {
        let chunker = RecursiveCharacterChunker::new(1000, 100);
        let doc = Document::new("short");
        let chunks = chunker.chunk(&doc).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "short");
    }

    #[test]
    fn test_exact_chunk_size() {
        let chunker = RecursiveCharacterChunker::new(5, 0);
        let doc = Document::new("12345");
        let chunks = chunker.chunk(&doc).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "12345");
    }

    #[test]
    fn test_chunk_indices_are_sequential() {
        let chunker = RecursiveCharacterChunker::new(5, 0);
        let doc = Document::new("abcdefghijklmnopqrstuvwxyz");
        let chunks = chunker.chunk(&doc).unwrap();
        // 26 bytes in windows of 5 without overlap: five full chunks plus "z".
        assert_eq!(chunks.len(), 6);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
    }

    #[test]
    fn test_chunk_document_id_matches() {
        let chunker = RecursiveCharacterChunker::new(10, 0);
        let doc = Document::new("some text that needs chunking");
        let doc_id = doc.id.clone();
        let chunks = chunker.chunk(&doc).unwrap();
        // Guard against the loop below passing vacuously.
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_eq!(chunk.document_id, doc_id);
        }
    }

    /// The last 3 bytes of the first chunk must reappear at the start of the second.
    #[test]
    fn test_overlap_creates_overlapping_content() {
        let chunker = RecursiveCharacterChunker::new(10, 3);
        let doc = Document::new("0123456789ABCDEF");
        let chunks = chunker.chunk(&doc).unwrap();
        assert!(chunks.len() >= 2);
        assert_eq!(texts(&chunks), vec!["0123456789", "789ABCDEF"]);
        assert!(chunks[0].text.ends_with("789"));
        assert!(chunks[1].text.starts_with("789"));
    }

    /// Multi-byte characters must never be split, and no chunk may exceed the byte budget.
    #[test]
    fn test_unicode_content_does_not_panic() {
        let chunker = RecursiveCharacterChunker::new(5, 1);
        let doc = Document::new("héllo wörld 🦀 rust");
        let result = chunker.chunk(&doc);
        assert!(result.is_ok());
        let chunks = result.unwrap();
        assert!(!chunks.is_empty());
        assert!(chunks
            .iter()
            .all(|chunk| !chunk.text.is_empty() && chunk.text.len() <= 5));
        assert_eq!(chunks[0].text, "héll");
        assert_eq!(chunks[chunks.len() - 1].text, " rust");
    }
}