mod timestamp;
pub use timestamp::TimestampChunker;
use crate::{Document, DocumentId, Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Returns the smallest index `>= i` that lies on a UTF-8 char boundary of `s`.
///
/// Indices at or past the end of `s` are clamped to `s.len()`.
fn ceil_char_boundary(s: &str, i: usize) -> usize {
    let limit = s.len();
    // An empty range (i >= limit) falls straight through to `limit`.
    (i..limit)
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(limit)
}
/// Identifier of a [`Chunk`], wrapping a UUID.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ChunkId(pub uuid::Uuid);

impl ChunkId {
    /// Creates an identifier from a newly generated UUID (`Uuid::new_v4`).
    #[must_use]
    pub fn new() -> Self {
        let raw = uuid::Uuid::new_v4();
        ChunkId(raw)
    }
}

impl Default for ChunkId {
    /// Same as [`ChunkId::new`]: each default value is a new identifier.
    fn default() -> Self {
        ChunkId::new()
    }
}

impl std::fmt::Display for ChunkId {
    /// Writes the wrapped UUID using its `Display` representation.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let ChunkId(inner) = self;
        write!(f, "{inner}")
    }
}
/// Descriptive metadata attached to a [`Chunk`].
///
/// All fields start empty via the derived `Default`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ChunkMetadata {
/// Title associated with the chunk, if any.
pub title: Option<String>,
/// Header strings associated with the chunk.
pub headers: Vec<String>,
/// Optional page number.
/// NOTE(review): whether this is 0- or 1-based is not established in this file — confirm.
pub page: Option<usize>,
/// Additional caller-defined values, keyed by name.
pub custom: HashMap<String, serde_json::Value>,
}
/// A piece of a document's text together with its position and metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
/// Identifier of this chunk.
pub id: ChunkId,
/// Identifier of the document the chunk was produced from.
pub document_id: DocumentId,
/// Text of the chunk.
pub content: String,
/// Offset at which the chunk starts; the chunkers in this file compute it
/// as a byte position in the source document's content.
pub start_offset: usize,
/// Offset at which the chunk ends (the chunkers in this file set it to
/// `start_offset + content.len()`).
pub end_offset: usize,
/// Descriptive metadata for the chunk.
pub metadata: ChunkMetadata,
/// Embedding vector, once one has been attached.
pub embedding: Option<Vec<f32>>,
}
impl Chunk {
#[must_use]
pub fn new(
document_id: DocumentId,
content: String,
start_offset: usize,
end_offset: usize,
) -> Self {
Self {
id: ChunkId::new(),
document_id,
content,
start_offset,
end_offset,
metadata: ChunkMetadata::default(),
embedding: None,
}
}
#[must_use]
pub fn len(&self) -> usize {
self.content.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.content.is_empty()
}
pub fn set_embedding(&mut self, embedding: Vec<f32>) {
contract_pre_embedding_lookup!(embedding);
self.embedding = Some(embedding);
}
}
/// Describes a chunking approach together with its parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ChunkingStrategy {
/// Fixed-size chunks with an overlap between consecutive chunks.
FixedSize {
/// Size of each chunk.
/// NOTE(review): the unit (bytes vs. chars) is not fixed by this enum — confirm against the consumer.
chunk_size: usize,
/// Amount shared between consecutive chunks.
overlap: usize,
},
/// Chunks made of whole sentences.
Sentence {
/// Maximum number of sentences per chunk.
max_sentences: usize,
/// Number of sentences shared between consecutive chunks.
overlap_sentences: usize,
},
/// Chunks made of whole paragraphs.
Paragraph {
/// Maximum number of paragraphs per chunk.
max_paragraphs: usize,
},
/// Splitting driven by an ordered list of separators.
Recursive {
/// Separators to split on, in the order they should be tried.
separators: Vec<String>,
/// Target size of each chunk.
chunk_size: usize,
/// Amount shared between consecutive chunks.
overlap: usize,
},
}
impl Default for ChunkingStrategy {
fn default() -> Self {
Self::Recursive {
separators: vec![
"\n\n".to_string(),
"\n".to_string(),
". ".to_string(),
" ".to_string(),
],
chunk_size: 512,
overlap: 50,
}
}
}
/// Common interface for splitting a [`Document`] into [`Chunk`]s.
///
/// Implementors must be `Send + Sync`.
pub trait Chunker: Send + Sync {
/// Splits `document` into chunks, or returns an error when it cannot be chunked.
fn chunk(&self, document: &Document) -> Result<Vec<Chunk>>;
/// Returns an estimate of how many chunks [`Chunker::chunk`] would produce for `document`.
fn estimate_chunks(&self, document: &Document) -> usize;
}
#[derive(Debug, Clone)]
pub struct RecursiveChunker {
separators: Vec<String>,
chunk_size: usize,
overlap: usize,
}
impl RecursiveChunker {
#[must_use]
pub fn new(chunk_size: usize, overlap: usize) -> Self {
Self {
separators: vec![
"\n\n".to_string(),
"\n".to_string(),
". ".to_string(),
" ".to_string(),
],
chunk_size,
overlap,
}
}
#[must_use]
pub fn with_separators(mut self, separators: Vec<String>) -> Self {
self.separators = separators;
self
}
fn split_text(&self, text: &str, separator_idx: usize) -> Vec<String> {
if text.len() <= self.chunk_size {
return vec![text.to_string()];
}
if separator_idx >= self.separators.len() {
return self.split_by_chars(text);
}
let separator = &self.separators[separator_idx];
let parts: Vec<&str> = text.split(separator).collect();
if parts.len() == 1 {
return self.split_text(text, separator_idx + 1);
}
self.merge_splits(&parts, separator, separator_idx)
}
fn merge_splits(&self, parts: &[&str], separator: &str, separator_idx: usize) -> Vec<String> {
let mut chunks = Vec::new();
let mut current = String::new();
for part in parts {
let potential = if current.is_empty() {
(*part).to_string()
} else {
format!("{current}{separator}{part}")
};
if potential.len() <= self.chunk_size {
current = potential;
} else if current.is_empty() {
chunks.extend(self.split_text(part, separator_idx + 1));
} else {
chunks.push(current);
current = (*part).to_string();
}
}
if !current.is_empty() {
if current.len() <= self.chunk_size {
chunks.push(current);
} else {
chunks.extend(self.split_text(¤t, separator_idx + 1));
}
}
chunks
}
fn split_by_chars(&self, text: &str) -> Vec<String> {
let chars: Vec<char> = text.chars().collect();
let mut chunks = Vec::new();
let mut start = 0;
while start < chars.len() {
let end = (start + self.chunk_size).min(chars.len());
let chunk: String = chars[start..end].iter().collect();
chunks.push(chunk);
if end >= chars.len() {
break;
}
start = if end > self.overlap { end - self.overlap } else { end };
}
chunks
}
fn apply_overlap(&self, chunks: Vec<String>) -> Vec<String> {
if self.overlap == 0 || chunks.len() <= 1 {
return chunks;
}
let mut result = Vec::with_capacity(chunks.len());
for (i, chunk) in chunks.iter().enumerate() {
if i == 0 {
result.push(chunk.clone());
} else {
let prev = &chunks[i - 1];
let overlap_text = if prev.len() > self.overlap {
let start = prev.len() - self.overlap;
let start = ceil_char_boundary(prev, start);
&prev[start..]
} else {
prev.as_str()
};
result.push(format!("{overlap_text}{chunk}"));
}
}
result
}
}
impl Chunker for RecursiveChunker {
fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
if document.content.is_empty() {
return Err(Error::EmptyDocument(
document.title.clone().unwrap_or_else(|| "untitled".to_string()),
));
}
let text_chunks = self.split_text(&document.content, 0);
let overlapped = self.apply_overlap(text_chunks);
let mut offset = 0;
let mut chunks = Vec::new();
for content in overlapped {
let safe_offset = ceil_char_boundary(&document.content, offset);
let start = document.content[safe_offset..]
.find(&content)
.map_or(safe_offset, |pos| safe_offset + pos);
let end = start + content.len();
let mut chunk = Chunk::new(document.id, content, start, end);
chunk.metadata.title = document.title.clone();
chunks.push(chunk);
offset = ceil_char_boundary(&document.content, start + 1);
}
Ok(chunks)
}
fn estimate_chunks(&self, document: &Document) -> usize {
if document.content.is_empty() {
return 0;
}
let effective_size = self.chunk_size.saturating_sub(self.overlap);
if effective_size == 0 {
return 1;
}
(document.content.len() + effective_size - 1) / effective_size
}
}
/// Chunker producing windows of a fixed number of characters.
#[derive(Debug, Clone)]
pub struct FixedSizeChunker {
    /// Number of chars per chunk.
    chunk_size: usize,
    /// Number of chars shared by consecutive chunks.
    overlap: usize,
}

impl FixedSizeChunker {
    /// Creates a chunker with the given window size and overlap.
    #[must_use]
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        FixedSizeChunker {
            chunk_size,
            overlap,
        }
    }
}
impl Chunker for FixedSizeChunker {
    /// Cuts the document into windows of `chunk_size` chars that advance by
    /// `chunk_size - overlap` chars (at least one char).
    ///
    /// Offsets stored on each chunk are byte positions in the document
    /// content. They are looked up in a table of char boundaries built once,
    /// so the work is linear in the document length.
    ///
    /// # Errors
    ///
    /// Returns `Error::EmptyDocument` when the document content is empty.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        let text = document.content.as_str();
        if text.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }
        // `boundaries[k]` is the byte position of the k-th char; the final
        // entry is the end of the text.
        let mut boundaries: Vec<usize> = text.char_indices().map(|(idx, _)| idx).collect();
        boundaries.push(text.len());
        let char_count = boundaries.len() - 1;
        let step = self.chunk_size.saturating_sub(self.overlap).max(1);

        let mut chunks = Vec::new();
        let mut start = 0;
        while start < char_count {
            let end = start.saturating_add(self.chunk_size).min(char_count);
            let (byte_start, byte_end) = (boundaries[start], boundaries[end]);
            let content = text[byte_start..byte_end].to_string();
            let mut chunk = Chunk::new(document.id, content, byte_start, byte_end);
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);
            if end >= char_count {
                break;
            }
            start += step;
        }
        Ok(chunks)
    }

    /// Estimates the chunk count as the char count divided (rounding up) by
    /// the window step; 0 for empty content.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let char_count = document.content.chars().count();
        // A step of 0 is treated as 1, which yields one chunk per char.
        let step = self.chunk_size.saturating_sub(self.overlap).max(1);
        (char_count + step - 1) / step
    }
}
/// Chunker that groups consecutive sentences based on embedding similarity.
pub struct SemanticChunker<E: crate::embed::Embedder> {
    /// Embedder used to embed individual sentences.
    embedder: E,
    /// A sentence whose similarity to the current group falls below this
    /// value starts a new chunk.
    pub similarity_threshold: f32,
    /// Limit on the summed byte lengths of the sentences in one chunk.
    pub max_chunk_size: usize,
}

impl<E: crate::embed::Embedder> SemanticChunker<E> {
    /// Creates a chunker from an embedder and its two thresholds.
    pub fn new(embedder: E, similarity_threshold: f32, max_chunk_size: usize) -> Self {
        SemanticChunker {
            embedder,
            similarity_threshold,
            max_chunk_size,
        }
    }

    /// Splits `text` into trimmed, non-empty sentences.
    ///
    /// A sentence ends at `.`, `!` or `?` when that character is followed by
    /// whitespace or is the last character of the text. Trailing text without
    /// a terminator forms a final sentence.
    fn split_sentences(text: &str) -> Vec<&str> {
        let mut found = Vec::new();
        let mut cursor = 0;
        let mut walker = text.char_indices().peekable();
        while let Some((idx, ch)) = walker.next() {
            if !matches!(ch, '.' | '!' | '?') {
                continue;
            }
            let at_boundary = match walker.peek() {
                Some(&(_, following)) => following.is_whitespace(),
                None => true,
            };
            if !at_boundary {
                continue;
            }
            let stop = idx + ch.len_utf8();
            let candidate = text[cursor..stop].trim();
            if !candidate.is_empty() {
                found.push(candidate);
            }
            cursor = stop;
        }
        let tail = text[cursor..].trim();
        if !tail.is_empty() {
            found.push(tail);
        }
        found
    }
}
impl<E: crate::embed::Embedder> Chunker for SemanticChunker<E> {
fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
if document.content.is_empty() {
return Err(Error::EmptyDocument(
document.title.clone().unwrap_or_else(|| "untitled".to_string()),
));
}
let sentences = Self::split_sentences(&document.content);
if sentences.is_empty() {
return Err(Error::EmptyDocument(
document.title.clone().unwrap_or_else(|| "untitled".to_string()),
));
}
if sentences.len() == 1 {
let content = sentences[0].to_string();
let start_offset = document.content.find(&content).unwrap_or(0);
let end_offset = start_offset + content.len();
let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
chunk.metadata.title = document.title.clone();
return Ok(vec![chunk]);
}
let embeddings: Vec<Vec<f32>> = sentences
.iter()
.map(|s| {
self.embedder.embed(s).unwrap_or_else(|e| {
eprintln!("Embedding failed for sentence: {e}");
vec![0.0; self.embedder.dimension()]
})
})
.collect();
let mut chunks = Vec::new();
let mut current_sentences: Vec<&str> = vec![sentences[0]];
let mut current_embedding = &embeddings[0];
for i in 1..sentences.len() {
let similarity = crate::embed::cosine_similarity(current_embedding, &embeddings[i]);
let current_len: usize = current_sentences.iter().map(|s| s.len()).sum();
if similarity < self.similarity_threshold
|| current_len + sentences[i].len() > self.max_chunk_size
{
let content = current_sentences.join(" ");
let start_offset = document.content.find(&content).unwrap_or(0);
let end_offset = start_offset + content.len();
let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
chunk.metadata.title = document.title.clone();
chunks.push(chunk);
current_sentences = vec![sentences[i]];
current_embedding = &embeddings[i];
} else {
current_sentences.push(sentences[i]);
}
}
if !current_sentences.is_empty() {
let content = current_sentences.join(" ");
let start_offset = document.content.find(&content).unwrap_or(0);
let end_offset = start_offset + content.len();
let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
chunk.metadata.title = document.title.clone();
chunks.push(chunk);
}
Ok(chunks)
}
fn estimate_chunks(&self, document: &Document) -> usize {
if document.content.is_empty() {
return 0;
}
let sentences = Self::split_sentences(&document.content);
(sentences.len() + 2) / 3 }
}
/// Chunker that follows `#`-style header lines in the text.
#[derive(Debug, Clone)]
pub struct StructuralChunker {
    /// When `true`, the text is split into sections at header lines.
    pub respect_headers: bool,
    /// Sections longer than this many bytes are split further.
    pub max_section_size: usize,
}

impl StructuralChunker {
    /// Creates a chunker with the given settings.
    #[must_use]
    pub fn new(respect_headers: bool, max_section_size: usize) -> Self {
        StructuralChunker {
            respect_headers,
            max_section_size,
        }
    }

    /// Returns the header text of `line` when, after trimming, it starts with
    /// `#` and has non-empty text once the leading `#`s are removed.
    fn extract_header(line: &str) -> Option<String> {
        let after_first_hash = line.trim().strip_prefix('#')?;
        let title = after_first_hash.trim_start_matches('#').trim();
        if title.is_empty() {
            None
        } else {
            Some(title.to_string())
        }
    }

    /// Returns `true` when `line` is a header line as defined by `extract_header`.
    fn is_header(line: &str) -> bool {
        Self::extract_header(line).is_some()
    }

    /// Splits `text` into `(header, content)` sections at header lines.
    ///
    /// The content of a section includes its own header line and is trimmed;
    /// lines are re-joined with `\n`. Text before the first header forms a
    /// section without a header.
    fn split_by_headers(text: &str) -> Vec<(Option<String>, String)> {
        let mut sections = Vec::new();
        let mut pending_header: Option<String> = None;
        let mut buffer = String::new();
        for line in text.lines() {
            if let Some(title) = Self::extract_header(line) {
                let has_body = !buffer.trim().is_empty();
                if has_body || pending_header.is_some() {
                    sections.push((pending_header.take(), buffer.trim().to_string()));
                    buffer.clear();
                }
                pending_header = Some(title);
            }
            buffer.push_str(line);
            buffer.push('\n');
        }
        let leftover = buffer.trim();
        if !leftover.is_empty() {
            sections.push((pending_header, leftover.to_string()));
        }
        sections
    }
}
impl Chunker for StructuralChunker {
    /// Produces one chunk per section, splitting oversized sections further.
    ///
    /// Sections come from the header lines when `respect_headers` is set;
    /// otherwise the whole content is a single section. A section longer than
    /// `max_section_size` bytes is split by a [`RecursiveChunker`] with an
    /// overlap of 50. Chunks of a headed section carry that header in
    /// `metadata.headers`.
    ///
    /// Sections are located by searching the document forward from the end of
    /// the previous section, so repeated section text maps to the right
    /// occurrence. Offsets of sub-chunks are shifted by their section's
    /// position so they refer to the document rather than to the section. When
    /// a section's text is not found verbatim (its lines are re-joined with
    /// `\n`), position 0 is used as before.
    ///
    /// # Errors
    ///
    /// Returns `Error::EmptyDocument` when the content is empty or yields no
    /// sections.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }
        let sections = if self.respect_headers {
            Self::split_by_headers(&document.content)
        } else {
            vec![(None, document.content.clone())]
        };
        if sections.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sub_chunker = RecursiveChunker::new(self.max_section_size, 50);
        let mut chunks = Vec::new();
        // Byte position from which the next section is searched; it always
        // sits at the end of a previous match, hence on a char boundary.
        let mut cursor = 0;
        for (header, content) in sections {
            if content.is_empty() {
                continue;
            }
            let located = document.content[cursor..]
                .find(content.as_str())
                .map(|relative| cursor + relative);
            if let Some(position) = located {
                cursor = position + content.len();
            }
            let section_start = located.unwrap_or(0);

            if content.len() > self.max_section_size {
                let sub_doc = Document {
                    id: document.id,
                    content,
                    title: document.title.clone(),
                    source: document.source.clone(),
                    metadata: document.metadata.clone(),
                };
                // The section is non-empty, so the sub-chunker's empty-document
                // error is not expected here; any error still skips the section.
                if let Ok(sub_chunks) = sub_chunker.chunk(&sub_doc) {
                    for mut chunk in sub_chunks {
                        chunk.start_offset += section_start;
                        chunk.end_offset += section_start;
                        if let Some(ref h) = header {
                            chunk.metadata.headers.push(h.clone());
                        }
                        chunks.push(chunk);
                    }
                }
            } else {
                let end_offset = section_start + content.len();
                let mut chunk = Chunk::new(document.id, content, section_start, end_offset);
                chunk.metadata.title = document.title.clone();
                if let Some(h) = header {
                    chunk.metadata.headers.push(h);
                }
                chunks.push(chunk);
            }
        }

        // Fall back to a single chunk holding the whole document.
        if chunks.is_empty() {
            let content = document.content.clone();
            let mut chunk = Chunk::new(document.id, content, 0, document.content.len());
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);
        }
        Ok(chunks)
    }

    /// Estimates the chunk count as the number of sections (at least 1);
    /// 0 for empty content.
    ///
    /// Without `respect_headers` the content is one section, so the estimate
    /// is 1. Further splitting of oversized sections is not accounted for.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        if !self.respect_headers {
            return 1;
        }
        Self::split_by_headers(&document.content).len().max(1)
    }
}
/// Chunker that groups whole paragraphs.
#[derive(Debug, Clone)]
pub struct ParagraphChunker {
    /// Maximum number of paragraphs per chunk.
    max_paragraphs: usize,
}

impl ParagraphChunker {
    /// Creates a chunker emitting up to `max_paragraphs` paragraphs per chunk.
    #[must_use]
    pub fn new(max_paragraphs: usize) -> Self {
        ParagraphChunker { max_paragraphs }
    }

    /// Splits `text` on blank lines (`"\n\n"`) into trimmed, non-empty paragraphs.
    fn split_paragraphs(text: &str) -> Vec<&str> {
        text.split("\n\n")
            .filter_map(|block| {
                let paragraph = block.trim();
                if paragraph.is_empty() {
                    None
                } else {
                    Some(paragraph)
                }
            })
            .collect()
    }
}
impl Chunker for ParagraphChunker {
fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
if document.content.is_empty() {
return Err(Error::EmptyDocument(
document.title.clone().unwrap_or_else(|| "untitled".to_string()),
));
}
let paragraphs = Self::split_paragraphs(&document.content);
if paragraphs.is_empty() {
return Err(Error::EmptyDocument(
document.title.clone().unwrap_or_else(|| "untitled".to_string()),
));
}
let mut chunks = Vec::new();
let mut i = 0;
while i < paragraphs.len() {
let end = (i + self.max_paragraphs).min(paragraphs.len());
let content = paragraphs[i..end].join("\n\n");
let start_offset = document.content.find(&content).unwrap_or(0);
let end_offset = start_offset + content.len();
let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
chunk.metadata.title = document.title.clone();
chunks.push(chunk);
i = end;
}
Ok(chunks)
}
fn estimate_chunks(&self, document: &Document) -> usize {
if document.content.is_empty() {
return 0;
}
let paragraphs = Self::split_paragraphs(&document.content);
if self.max_paragraphs == 0 {
return paragraphs.len();
}
(paragraphs.len() + self.max_paragraphs - 1) / self.max_paragraphs
}
}
/// Chunker that groups whole sentences, optionally overlapping.
#[derive(Debug, Clone)]
pub struct SentenceChunker {
    /// Maximum number of sentences per chunk.
    max_sentences: usize,
    /// Number of sentences shared by consecutive chunks.
    overlap_sentences: usize,
}

impl SentenceChunker {
    /// Creates a chunker with the given group size and overlap.
    #[must_use]
    pub fn new(max_sentences: usize, overlap_sentences: usize) -> Self {
        SentenceChunker {
            max_sentences,
            overlap_sentences,
        }
    }

    /// Splits `text` into trimmed, non-empty sentences.
    ///
    /// A sentence ends at `.`, `!` or `?` when that character is the last one
    /// of the text or is followed by whitespace or an uppercase character.
    /// Trailing text without a terminator forms a final sentence.
    fn split_sentences(text: &str) -> Vec<&str> {
        let ends_sentence = |next: Option<char>| match next {
            Some(following) => following.is_whitespace() || following.is_uppercase(),
            None => true,
        };
        let mut collected = Vec::new();
        let mut begin = 0;
        let mut scanner = text.char_indices().peekable();
        while let Some((position, symbol)) = scanner.next() {
            let is_terminator = symbol == '.' || symbol == '!' || symbol == '?';
            if is_terminator && ends_sentence(scanner.peek().map(|&(_, following)| following)) {
                let finish = position + symbol.len_utf8();
                let piece = text[begin..finish].trim();
                if !piece.is_empty() {
                    collected.push(piece);
                }
                begin = finish;
            }
        }
        let rest = text[begin..].trim();
        if !rest.is_empty() {
            collected.push(rest);
        }
        collected
    }
}
impl Chunker for SentenceChunker {
    /// Groups up to `max_sentences` consecutive sentences per chunk, joined
    /// with a single space. Each group starts `max_sentences -
    /// overlap_sentences` sentences (at least one) after the previous one.
    ///
    /// `start_offset` is the byte position of the chunk's first sentence in
    /// the document (0 for an empty group); `end_offset` is
    /// `start_offset + content.len()`.
    ///
    /// Content without any sentence yields an empty list.
    ///
    /// # Errors
    ///
    /// Returns `Error::EmptyDocument` when the document content is empty.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }
        let sentences = Self::split_sentences(&document.content);
        let step = self.max_sentences.saturating_sub(self.overlap_sentences).max(1);
        // Sentences are sub-slices of `document.content`, so the distance
        // between start addresses is the exact byte offset. A textual search
        // would return the first occurrence, or nothing when the original
        // whitespace is not a single space.
        let base = document.content.as_ptr() as usize;

        let mut chunks = Vec::new();
        let mut i = 0;
        while i < sentences.len() {
            let end = i.saturating_add(self.max_sentences).min(sentences.len());
            let group = &sentences[i..end];
            let content = group.join(" ");
            let start_offset = group
                .first()
                .map_or(0, |sentence| sentence.as_ptr() as usize - base);
            let end_offset = start_offset + content.len();
            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);
            i += step;
        }
        Ok(chunks)
    }

    /// Number of groups `chunk` would form: the sentence count divided by the
    /// group step (0 counted as 1), rounding up; 0 for empty content.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let sentences = Self::split_sentences(&document.content);
        let step = self.max_sentences.saturating_sub(self.overlap_sentences).max(1);
        (sentences.len() + step - 1) / step
    }
}
#[cfg(test)]
mod tests;