use super::types::{ChunkerConfig, DocumentChunk};
/// Splits document text into [`DocumentChunk`]s according to a [`ChunkerConfig`].
pub struct DocumentChunker {
/// Settings read by the chunking methods: target/max/min chunk sizes, overlap
/// size, and whether paragraphs and Markdown headers are respected.
config: ChunkerConfig,
}
impl DocumentChunker {
pub fn new() -> Self {
Self {
config: ChunkerConfig::default(),
}
}
pub fn with_config(config: ChunkerConfig) -> Self {
Self { config }
}
/// Splits `content` into chunks attributed to `document_id`.
///
/// Empty input yields an empty vector. Content that looks like Markdown is
/// chunked per header section when `respect_headers` is enabled; everything
/// else goes through the plain-text path.
pub fn chunk(&self, document_id: &str, content: &str) -> Vec<DocumentChunk> {
    if content.is_empty() {
        return Vec::new();
    }

    match (
        Self::looks_like_markdown(content),
        self.config.respect_headers,
    ) {
        (true, true) => self.chunk_markdown(document_id, content),
        _ => self.chunk_plain_text(document_id, content),
    }
}
/// Heuristically decides whether `content` is Markdown.
///
/// Only the first 2000 bytes are inspected (rounded down to a UTF-8 character
/// boundary). The text counts as Markdown when that sample starts with `"# "`
/// or contains a level 1-3 header at the start of a line, a code fence
/// (three backticks), bold markers (`**`), or an empty-text link (`[](`).
fn looks_like_markdown(content: &str) -> bool {
    // Upper bound, in bytes, on how much of the document is sampled.
    const SAMPLE_BYTES: usize = 2000;
    // Substrings whose presence anywhere in the sample marks it as Markdown.
    // NOTE(review): "[](" only matches links with empty text; "](" may have
    // been intended. Left unchanged to keep the classification stable.
    const MARKERS: [&str; 6] = ["\n# ", "\n## ", "\n### ", "```", "**", "[]("];

    // Slicing at a fixed byte index panics when that index falls inside a
    // multi-byte character, so back off to the nearest boundary first.
    // Index 0 is always a boundary, so the loop terminates.
    let mut end = content.len().min(SAMPLE_BYTES);
    while !content.is_char_boundary(end) {
        end -= 1;
    }
    let sample = &content[..end];

    sample.starts_with("# ") || MARKERS.iter().any(|marker| sample.contains(*marker))
}
/// Chunks Markdown `content` one header-delimited section at a time.
///
/// Every line recognised by `extract_markdown_header` starts a new section.
/// A section's text (header line included) is handed to `add_section_chunks`
/// together with the byte offset in `content` at which the section begins
/// and the header text as its label; text before the first header has no
/// label. Whitespace-only sections are skipped. Once all sections are
/// processed, chunks are numbered and given ids of the form
/// `"{document_id}:{index}"`.
fn chunk_markdown(&self, document_id: &str, content: &str) -> Vec<DocumentChunk> {
    let mut chunks = Vec::new();
    let mut current_section: Option<String> = None;
    let mut section_content = String::new();
    // Byte offset in `content` at which the section being accumulated begins.
    let mut section_start = 0;
    // Byte offset in `content` of the line currently being examined.
    let mut line_start = 0;

    // `split_inclusive` keeps the terminators so offsets can be tracked
    // exactly, which `lines()` alone cannot do (it hides "\n" vs "\r\n").
    for raw_line in content.split_inclusive('\n') {
        // Strip the terminator the same way `str::lines` does.
        let line = raw_line
            .strip_suffix('\n')
            .map_or(raw_line, |l| l.strip_suffix('\r').unwrap_or(l));

        if let Some(header) = Self::extract_markdown_header(line) {
            if !section_content.trim().is_empty() {
                self.add_section_chunks(
                    document_id,
                    &section_content,
                    section_start,
                    current_section.as_deref(),
                    &mut chunks,
                );
            }
            current_section = Some(header);
            // The new section starts at this header line's position in the
            // document, not at the length of the section just flushed.
            section_start = line_start;
            section_content.clear();
        }

        section_content.push_str(line);
        section_content.push('\n');
        line_start += raw_line.len();
    }

    if !section_content.trim().is_empty() {
        self.add_section_chunks(
            document_id,
            &section_content,
            section_start,
            current_section.as_deref(),
            &mut chunks,
        );
    }

    // Index and id can only be assigned once the final chunk count is known.
    let total = chunks.len() as u32;
    for (i, chunk) in chunks.iter_mut().enumerate() {
        chunk.chunk_index = i as u32;
        chunk.total_chunks = total;
        chunk.chunk_id = format!("{}:{}", document_id, i);
    }
    chunks
}
/// Returns the heading text if `line` is an ATX-style Markdown header.
///
/// After trimming surrounding whitespace, a header is one to six `#`
/// characters, followed by whitespace, followed by non-empty text. Requiring
/// the whitespace keeps lines such as `#include <stdio.h>`, `#hashtag` or
/// `#!/bin/sh` from being mistaken for headings.
fn extract_markdown_header(line: &str) -> Option<String> {
    let trimmed = line.trim();
    let rest = trimmed.trim_start_matches('#');
    let hash_count = trimmed.len() - rest.len();

    if hash_count == 0 || hash_count > 6 {
        return None;
    }
    // `trimmed` has no trailing whitespace, so a `rest` that starts with
    // whitespace is guaranteed to contain heading text after it.
    if !rest.starts_with(char::is_whitespace) {
        return None;
    }

    Some(rest.trim().to_string())
}
/// Chunks one section's text and appends the result to `chunks`.
///
/// Uses the paragraph-based strategy when `respect_paragraphs` is set and the
/// fixed-size strategy otherwise. `base_offset` and `section` are forwarded
/// unchanged to the chosen strategy.
fn add_section_chunks(
    &self,
    document_id: &str,
    content: &str,
    base_offset: usize,
    section: Option<&str>,
    chunks: &mut Vec<DocumentChunk>,
) {
    if !self.config.respect_paragraphs {
        self.chunk_by_size(document_id, content, base_offset, section, chunks);
        return;
    }
    self.chunk_by_paragraphs(document_id, content, base_offset, section, chunks);
}
/// Chunks `content` as a single unlabelled section starting at offset 0.
///
/// The strategy (paragraph-based or fixed-size) follows `respect_paragraphs`.
/// Afterwards every chunk receives its index, the total chunk count and an id
/// of the form `"{document_id}:{index}"`.
fn chunk_plain_text(&self, document_id: &str, content: &str) -> Vec<DocumentChunk> {
    let mut chunks = Vec::new();
    // Same strategy dispatch as for a Markdown section, with no label.
    self.add_section_chunks(document_id, content, 0, None, &mut chunks);

    let total = chunks.len() as u32;
    chunks
        .iter_mut()
        .enumerate()
        .for_each(|(index, chunk)| {
            chunk.chunk_index = index as u32;
            chunk.total_chunks = total;
            chunk.chunk_id = format!("{}:{}", document_id, index);
        });
    chunks
}
/// Packs blank-line-separated paragraphs of `content` into chunks.
///
/// Trimmed paragraphs are accumulated (joined by a blank line) until adding
/// the next one would exceed `target_chunk_size`. The pending text is then
/// emitted, provided it reaches `min_chunk_size`, and the next chunk is
/// seeded with an overlap tail from `create_overlap`. Pending text that is
/// still below `min_chunk_size` is kept and merged with the following
/// paragraph instead of being discarded. Accumulated text longer than
/// `max_chunk_size` is delegated to `split_long_chunk`.
///
/// Offsets stored on the chunks are `base_offset` plus a position within
/// `content`. Start offsets after an overlap are approximations
/// (`end - overlap_size`). Trailing text is emitted when it reaches
/// `min_chunk_size` or when `chunks` is still empty.
fn chunk_by_paragraphs(
    &self,
    document_id: &str,
    content: &str,
    base_offset: usize,
    section: Option<&str>,
    chunks: &mut Vec<DocumentChunk>,
) {
    // Builds a chunk covering `start..end` (relative to `content`) and
    // attaches the section label when there is one. Index and total are
    // placeholders; the caller renumbers all chunks at the end.
    let make_chunk = |text: String, start: usize, end: usize| {
        let chunk = DocumentChunk::new(
            document_id.to_string(),
            text,
            base_offset + start,
            base_offset + end,
            0,
            0,
        );
        match section {
            Some(name) => chunk.with_section(name.to_string()),
            None => chunk,
        }
    };

    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_offset = 0;

    for para in content.split("\n\n") {
        // Position just past this paragraph and its "\n\n" separator. The
        // last paragraph has no separator, so clamp to the end of the text.
        let next_offset = (current_offset + para.len() + 2).min(content.len());

        let para_trimmed = para.trim();
        if para_trimmed.is_empty() {
            current_offset = next_offset;
            continue;
        }

        let would_exceed =
            current_chunk.len() + para_trimmed.len() > self.config.target_chunk_size;
        if would_exceed && !current_chunk.is_empty() {
            let chunk_content = current_chunk.trim();
            if chunk_content.len() >= self.config.min_chunk_size {
                chunks.push(make_chunk(
                    chunk_content.to_string(),
                    chunk_start,
                    current_offset,
                ));
                current_chunk = self.create_overlap(&current_chunk);
                chunk_start = current_offset.saturating_sub(self.config.overlap_size);
            }
            // Otherwise the pending text is too small to stand alone: keep
            // it so it is merged with this paragraph rather than lost.
        }

        if !current_chunk.is_empty() {
            current_chunk.push_str("\n\n");
        }
        current_chunk.push_str(para_trimmed);
        current_offset = next_offset;

        if current_chunk.len() > self.config.max_chunk_size {
            self.split_long_chunk(
                document_id,
                &current_chunk,
                base_offset + chunk_start,
                section,
                chunks,
            );
            current_chunk.clear();
            chunk_start = current_offset;
        }
    }

    let chunk_content = current_chunk.trim();
    if !chunk_content.is_empty()
        && (chunks.is_empty() || chunk_content.len() >= self.config.min_chunk_size)
    {
        chunks.push(make_chunk(
            chunk_content.to_string(),
            chunk_start,
            current_offset,
        ));
    }
}
/// Returns the tail of `content` used to seed the next chunk.
///
/// Content no longer than `overlap_size` bytes is returned whole. Otherwise
/// the last `overlap_size` bytes are taken (shortened as needed to start on a
/// UTF-8 character boundary). Within that region the result starts after the
/// first `". "`, or failing that after the first space, or failing that is
/// the whole region.
fn create_overlap(&self, content: &str) -> String {
    let overlap_size = self.config.overlap_size;
    if content.len() <= overlap_size {
        return content.to_string();
    }

    // A raw `len - overlap_size` index may fall inside a multi-byte
    // character, which would make the slice below panic. Move forward to the
    // next boundary; `content.len()` is always one, so the loop terminates.
    let mut start = content.len() - overlap_size;
    while !content.is_char_boundary(start) {
        start += 1;
    }
    let overlap_region = &content[start..];

    // Prefer to resume at a sentence start, then at a word start.
    let tail = overlap_region
        .find(". ")
        .map(|pos| &overlap_region[pos + 2..])
        .or_else(|| overlap_region.find(' ').map(|pos| &overlap_region[pos + 1..]))
        .unwrap_or(overlap_region);
    tail.to_string()
}
fn split_long_chunk(
&self,
document_id: &str,
content: &str,
base_offset: usize,
section: Option<&str>,
chunks: &mut Vec<DocumentChunk>,
) {
let sentences = Self::split_sentences(content);
if sentences.len() == 1 && content.len() > self.config.target_chunk_size {
self.split_by_words(document_id, content, base_offset, section, chunks);
return;
}
let mut current_chunk = String::new();
let mut chunk_start = 0;
let mut current_offset = 0;
for sentence in sentences {
if current_chunk.len() + sentence.len() > self.config.target_chunk_size
&& !current_chunk.is_empty()
{
let mut chunk = DocumentChunk::new(
document_id.to_string(),
current_chunk.trim().to_string(),
base_offset + chunk_start,
base_offset + current_offset,
0,
0,
);
if let Some(s) = section {
chunk = chunk.with_section(s.to_string());
}
chunks.push(chunk);
current_chunk = self.create_overlap(¤t_chunk);
chunk_start = current_offset.saturating_sub(self.config.overlap_size);
}
current_chunk.push_str(&sentence);
current_chunk.push(' ');
current_offset += sentence.len() + 1;
}
if !current_chunk.trim().is_empty() {
let mut chunk = DocumentChunk::new(
document_id.to_string(),
current_chunk.trim().to_string(),
base_offset + chunk_start,
base_offset + current_offset,
0,
0,
);
if let Some(s) = section {
chunk = chunk.with_section(s.to_string());
}
chunks.push(chunk);
}
}
fn split_by_words(
&self,
document_id: &str,
content: &str,
base_offset: usize,
section: Option<&str>,
chunks: &mut Vec<DocumentChunk>,
) {
let words: Vec<&str> = content.split_whitespace().collect();
let mut current_chunk = String::new();
let mut chunk_start = 0;
let mut current_offset = 0;
for word in words {
if current_chunk.len() + word.len() + 1 > self.config.target_chunk_size
&& !current_chunk.is_empty()
{
let mut chunk = DocumentChunk::new(
document_id.to_string(),
current_chunk.trim().to_string(),
base_offset + chunk_start,
base_offset + current_offset,
0,
0,
);
if let Some(s) = section {
chunk = chunk.with_section(s.to_string());
}
chunks.push(chunk);
current_chunk = self.create_overlap(¤t_chunk);
chunk_start = current_offset.saturating_sub(self.config.overlap_size);
}
if !current_chunk.is_empty() {
current_chunk.push(' ');
}
current_chunk.push_str(word);
current_offset += word.len() + 1;
}
if !current_chunk.trim().is_empty() {
let mut chunk = DocumentChunk::new(
document_id.to_string(),
current_chunk.trim().to_string(),
base_offset + chunk_start,
base_offset + current_offset,
0,
0,
);
if let Some(s) = section {
chunk = chunk.with_section(s.to_string());
}
chunks.push(chunk);
}
}
/// Splits `text` into trimmed sentences.
///
/// A sentence ends at `.`, `!` or `?`, with these exceptions:
/// * the terminator is the very first byte of the pending sentence;
/// * the terminator is immediately followed by a letter or digit, which
///   keeps tokens such as `e.g.`, `3.14` or `file.txt` in one piece;
/// * the pending sentence ends (ASCII case-insensitively) with one of the
///   known abbreviations `mr.`, `mrs.`, `dr.`, `vs.`, `etc.`, `e.g.`, `i.e.`.
///
/// Any text left after the last terminator becomes a final sentence if it is
/// not blank.
fn split_sentences(text: &str) -> Vec<String> {
    const ABBREVIATIONS: [&str; 7] = ["mr.", "mrs.", "dr.", "vs.", "etc.", "e.g.", "i.e."];

    let mut sentences = Vec::new();
    let mut current = String::new();
    let mut chars = text.chars().peekable();

    while let Some(c) = chars.next() {
        current.push(c);

        if !matches!(c, '.' | '!' | '?') || current.len() <= 1 {
            continue;
        }
        // Without this look-ahead the first period of "e.g." already ends
        // the sentence, so the abbreviation check below could never match.
        if chars.peek().map_or(false, |next| next.is_alphanumeric()) {
            continue;
        }

        // Compare only the tail bytes; the abbreviations are pure ASCII, so
        // there is no need to lowercase the whole pending sentence.
        let pending = current.as_bytes();
        let is_abbreviation = ABBREVIATIONS.iter().any(|abbr| {
            pending.len() >= abbr.len()
                && pending[pending.len() - abbr.len()..].eq_ignore_ascii_case(abbr.as_bytes())
        });

        if !is_abbreviation {
            sentences.push(current.trim().to_string());
            current.clear();
        }
    }

    if !current.trim().is_empty() {
        sentences.push(current.trim().to_string());
    }
    sentences
}
/// Cuts `content` into windows of roughly `target_chunk_size` bytes.
///
/// Each window ends after the last space inside it when one exists (except
/// for the final window, which runs to the end of the text). Consecutive
/// windows overlap by up to `overlap_size` bytes. Windows whose trimmed text
/// is empty or shorter than `min_chunk_size` are not emitted. All slice
/// positions are kept on UTF-8 character boundaries, and the window start
/// strictly increases on every iteration.
///
/// Chunk offsets are `base_offset` plus the window's byte range in `content`.
fn chunk_by_size(
    &self,
    document_id: &str,
    content: &str,
    base_offset: usize,
    section: Option<&str>,
    chunks: &mut Vec<DocumentChunk>,
) {
    let len = content.len();
    // Invariant: `start` is always on a character boundary.
    let mut start = 0;

    while start < len {
        // Raw window end, pulled back so it never splits a character.
        let mut end = start
            .saturating_add(self.config.target_chunk_size)
            .min(len);
        while !content.is_char_boundary(end) {
            end -= 1;
        }
        if end <= start {
            // The target is zero or smaller than the character at `start`:
            // take that one character so the loop still makes progress.
            let first_char_len = content[start..].chars().next().map_or(1, char::len_utf8);
            end = start + first_char_len;
        }

        // Prefer to break just after the last space inside the window.
        let actual_end = if end < len {
            content[start..end]
                .rfind(' ')
                .map_or(end, |pos| start + pos + 1)
        } else {
            end
        };

        let chunk_content = content[start..actual_end].trim();
        if !chunk_content.is_empty() && chunk_content.len() >= self.config.min_chunk_size {
            let chunk = DocumentChunk::new(
                document_id.to_string(),
                chunk_content.to_string(),
                base_offset + start,
                base_offset + actual_end,
                0,
                0,
            );
            chunks.push(match section {
                Some(name) => chunk.with_section(name.to_string()),
                None => chunk,
            });
        }

        // Step back by the overlap, but only if that still moves forward.
        // The comparison uses this window's own start rather than
        // `chunks.last()`, which may belong to an earlier section.
        let mut next_start = actual_end.saturating_sub(self.config.overlap_size);
        while !content.is_char_boundary(next_start) {
            next_start += 1;
        }
        start = if next_start > start {
            next_start
        } else {
            actual_end
        };
    }
}
}
impl Default for DocumentChunker {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document produces no chunks.
    #[test]
    fn test_empty_content() {
        assert!(DocumentChunker::new().chunk("doc1", "").is_empty());
    }

    /// A short document comes back as one chunk holding the whole text.
    #[test]
    fn test_small_content_single_chunk() {
        let content = "This is a small document that fits in one chunk.";
        let chunks = DocumentChunker::new().chunk("doc1", content);

        assert_eq!(chunks.len(), 1);
        let only = &chunks[0];
        assert_eq!(only.content, content);
        assert_eq!(only.chunk_index, 0);
        assert_eq!(only.total_chunks, 1);
    }

    /// Paragraph mode with a small target size still yields at least one chunk.
    #[test]
    fn test_paragraph_chunking() {
        let config = ChunkerConfig {
            target_chunk_size: 100,
            max_chunk_size: 200,
            min_chunk_size: 10,
            overlap_size: 20,
            respect_paragraphs: true,
            respect_headers: false,
        };
        let content = "First paragraph with some content here.\n\nSecond paragraph with different content.\n\nThird paragraph to test chunking.";

        let chunks = DocumentChunker::with_config(config).chunk("doc1", content);
        assert!(!chunks.is_empty());
    }

    /// Headers and code fences are detected; plain prose is not.
    #[test]
    fn test_markdown_header_detection() {
        let markdown_samples = [
            "# Title\n\nContent",
            "Some text\n## Subtitle\n",
            "```rust\ncode\n```",
        ];
        for sample in markdown_samples.iter() {
            assert!(DocumentChunker::looks_like_markdown(sample));
        }
        assert!(!DocumentChunker::looks_like_markdown(
            "Plain text without any markdown."
        ));
    }

    /// Header text is extracted without the leading `#` run.
    #[test]
    fn test_markdown_header_extraction() {
        let cases: [(&str, Option<&str>); 4] = [
            ("# Title", Some("Title")),
            ("## Subtitle", Some("Subtitle")),
            ("### Nested", Some("Nested")),
            ("Regular text", None),
        ];
        for (line, expected) in cases.iter() {
            assert_eq!(
                DocumentChunker::extract_markdown_header(line),
                expected.map(str::to_string)
            );
        }
    }

    /// `.`, `!` and `?` each end a sentence.
    #[test]
    fn test_sentence_splitting() {
        let sentences = DocumentChunker::split_sentences(
            "First sentence. Second sentence! Third sentence? Fourth.",
        );
        assert_eq!(sentences.len(), 4);

        let expected_prefix = ["First sentence.", "Second sentence!", "Third sentence?"];
        for (index, expected) in expected_prefix.iter().enumerate() {
            assert_eq!(sentences[index], *expected);
        }
    }

    /// A known abbreviation does not end the sentence.
    #[test]
    fn test_abbreviations_not_split() {
        let text = "Dr. Smith went to the store. He bought milk.";
        assert_eq!(DocumentChunker::split_sentences(text).len(), 2);
    }

    /// The `small` preset uses smaller sizes than the `large` preset.
    #[test]
    fn test_config_presets() {
        let (small, large) = (ChunkerConfig::small(), ChunkerConfig::large());
        assert!(small.target_chunk_size < large.target_chunk_size);
        assert!(small.max_chunk_size < large.max_chunk_size);
    }
}